diff mbox

[5/5] btrfs: Verify every chunk has corresponding block group at mount time

Message ID 20180703091009.16399-6-wqu@suse.com (mailing list archive)
State New, archived
Headers show

Commit Message

Qu Wenruo July 3, 2018, 9:10 a.m. UTC
If a crafted btrfs has missing block group items, it could cause
unexpected behavior and breaks our expectation on 1:1
chunk<->block group mapping.

Although we added block group -> chunk mapping check, we still need
chunk -> block group mapping check.

This patch will do extra check to ensure each chunk has its
corresponding block group.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=199847
Reported-by: Xu Wen <wen.xu@gatech.edu>
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/extent-tree.c | 52 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

Comments

Gu Jinxiang July 4, 2018, 6:09 a.m. UTC | #1
> -----Original Message-----

> From: linux-btrfs-owner@vger.kernel.org [mailto:linux-btrfs-owner@vger.kernel.org] On Behalf Of Qu Wenruo

> Sent: Tuesday, July 03, 2018 5:10 PM

> To: linux-btrfs@vger.kernel.org

> Subject: [PATCH 5/5] btrfs: Verify every chunk has corresponding block group at mount time

> 

> If a crafted btrfs has missing block group items, it could cause

> unexpected behavior and breaks our expectation on 1:1

> chunk<->block group mapping.

> 

> Although we added block group -> chunk mapping check, we still need

> chunk -> block group mapping check.

> 

> This patch will do extra check to ensure each chunk has its

> corresponding block group.

> 

> Link: https://bugzilla.kernel.org/show_bug.cgi?id=199847

> Reported-by: Xu Wen <wen.xu@gatech.edu>

> Signed-off-by: Qu Wenruo <wqu@suse.com>

> ---

>  fs/btrfs/extent-tree.c | 52 +++++++++++++++++++++++++++++++++++++++++-

>  1 file changed, 51 insertions(+), 1 deletion(-)

> 

> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

> index 82b446f014b9..746095034ca2 100644

> --- a/fs/btrfs/extent-tree.c

> +++ b/fs/btrfs/extent-tree.c

> @@ -10038,6 +10038,56 @@ static int check_exist_chunk(struct btrfs_fs_info *fs_info, u64 start, u64 len,

>  	return ret;

>  }

> 

> +/*

> + * Iterate all chunks and verify each of them has corresponding block group

> + */

> +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)

> +{

> +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;

> +	struct extent_map *em;

> +	struct btrfs_block_group_cache *bg;

> +	u64 start = 0;

> +	int ret = 0;

> +

> +	while (1) {

> +		read_lock(&map_tree->map_tree.lock);

> +		em = lookup_extent_mapping(&map_tree->map_tree, start,

> +					   (u64)-1 - start);

> +		read_unlock(&map_tree->map_tree.lock);

> +		if (!em)

> +			break;

> +

> +		bg = btrfs_lookup_block_group(fs_info, em->start);

> +		if (!bg) {

> +			btrfs_err_rl(fs_info,

> +	"chunk start=%llu len=%llu doesn't have corresponding block group",

> +				     em->start, em->len);

> +			ret = -ENOENT;

> +			free_extent_map(em);

> +			break;

> +		}

> +		if (bg->key.objectid != em->start ||

> +		    bg->key.offset != em->len ||

> +		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=

> +		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {

> +			btrfs_err_rl(fs_info,

> +"chunk start=%llu len=%llu flags=0x%llx doesn't match with block group start=%llu len=%llu flags=0x%llx",

> +				em->start, em->len,

> +				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,

> +				bg->key.objectid, bg->key.offset,

> +				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);

> +			ret = -EUCLEAN;

> +			free_extent_map(em);

> +			btrfs_put_block_group(bg);

> +			break;

> +		}

> +		start = em->start + em->len;

> +		free_extent_map(em);

> +		btrfs_put_block_group(bg);

> +	}

> +	return ret;

> +}

> +

>  int btrfs_read_block_groups(struct btrfs_fs_info *info)

>  {

>  	struct btrfs_path *path;

> @@ -10227,7 +10277,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)

> 

>  	btrfs_add_raid_kobjects(info);

>  	init_global_block_rsv(info);

> -	ret = 0;

> +	ret = check_chunk_block_group_mappings(info);

>  error:

>  	btrfs_free_path(path);

>  	return ret;

> --


Reviewed-by: Gu Jinxiang <gujx@cn.fujitsu.com>


> 2.18.0

> 

> --

> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in

> the body of a message to majordomo@vger.kernel.org

> More majordomo info at  http://vger.kernel.org/majordomo-info.html

>
Nikolay Borisov July 4, 2018, 7:08 a.m. UTC | #2
On  3.07.2018 12:10, Qu Wenruo wrote:
> If a crafted btrfs has missing block group items, it could cause
> unexpected behavior and breaks our expectation on 1:1
> chunk<->block group mapping.
> 
> Although we added block group -> chunk mapping check, we still need
> chunk -> block group mapping check.
> 
> This patch will do extra check to ensure each chunk has its
> corresponding block group.
> 
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=199847
> Reported-by: Xu Wen <wen.xu@gatech.edu>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/extent-tree.c | 52 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 51 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 82b446f014b9..746095034ca2 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -10038,6 +10038,56 @@ static int check_exist_chunk(struct btrfs_fs_info *fs_info, u64 start, u64 len,
>  	return ret;
>  }
>  
> +/*
> + * Iterate all chunks and verify each of them has corresponding block group
> + */
> +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
> +	struct extent_map *em;
> +	struct btrfs_block_group_cache *bg;
> +	u64 start = 0;
> +	int ret = 0;
> +
> +	while (1) {
> +		read_lock(&map_tree->map_tree.lock);
> +		em = lookup_extent_mapping(&map_tree->map_tree, start,
> +					   (u64)-1 - start);
len parameter of lookup_extent_mapping eventually ends up in range_end.
Meaning it will just return -1. Why not use just -1 for len. Looking at
the rest of the code this seems to be the convention. But then there are
several places where 1 is passed as well. Hm, in any case a single
number is simpler than an expression.

> +		read_unlock(&map_tree->map_tree.lock);
> +		if (!em)
> +			break;
> +
> +		bg = btrfs_lookup_block_group(fs_info, em->start);
> +		if (!bg) {
> +			btrfs_err_rl(fs_info,
> +	"chunk start=%llu len=%llu doesn't have corresponding block group",
> +				     em->start, em->len);
> +			ret = -ENOENT;
> +			free_extent_map(em);
> +			break;
> +		}
> +		if (bg->key.objectid != em->start ||
> +		    bg->key.offset != em->len ||
> +		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
> +		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
> +			btrfs_err_rl(fs_info,
> +"chunk start=%llu len=%llu flags=0x%llx doesn't match with block group start=%llu len=%llu flags=0x%llx",
> +				em->start, em->len,
> +				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
> +				bg->key.objectid, bg->key.offset,
> +				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
> +			ret = -EUCLEAN;
> +			free_extent_map(em);
> +			btrfs_put_block_group(bg);
> +			break;
> +		}
> +		start = em->start + em->len;
> +		free_extent_map(em);
> +		btrfs_put_block_group(bg);
> +	}
> +	return ret;
> +}
> +
>  int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  {
>  	struct btrfs_path *path;
> @@ -10227,7 +10277,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  
>  	btrfs_add_raid_kobjects(info);
>  	init_global_block_rsv(info);
> -	ret = 0;
> +	ret = check_chunk_block_group_mappings(info);

Rather than doing that can we just get the count of chunks. Then if we
have as many chunks as BG have been read in and we know the BG -> chunk
mapping check has passed we can assume that chunks also map to BG
without going through all chunks.

>  error:
>  	btrfs_free_path(path);
>  	return ret;
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo July 4, 2018, 9:46 a.m. UTC | #3
On 2018年07月04日 15:08, Nikolay Borisov wrote:
> 
> 
> On  3.07.2018 12:10, Qu Wenruo wrote:
>> If a crafted btrfs has missing block group items, it could cause
>> unexpected behavior and breaks our expectation on 1:1
>> chunk<->block group mapping.
>>
>> Although we added block group -> chunk mapping check, we still need
>> chunk -> block group mapping check.
>>
>> This patch will do extra check to ensure each chunk has its
>> corresponding block group.
>>
>> Link: https://bugzilla.kernel.org/show_bug.cgi?id=199847
>> Reported-by: Xu Wen <wen.xu@gatech.edu>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>  fs/btrfs/extent-tree.c | 52 +++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 51 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index 82b446f014b9..746095034ca2 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -10038,6 +10038,56 @@ static int check_exist_chunk(struct btrfs_fs_info *fs_info, u64 start, u64 len,
>>  	return ret;
>>  }
>>  
>> +/*
>> + * Iterate all chunks and verify each of them has corresponding block group
>> + */
>> +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
>> +{
>> +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
>> +	struct extent_map *em;
>> +	struct btrfs_block_group_cache *bg;
>> +	u64 start = 0;
>> +	int ret = 0;
>> +
>> +	while (1) {
>> +		read_lock(&map_tree->map_tree.lock);
>> +		em = lookup_extent_mapping(&map_tree->map_tree, start,
>> +					   (u64)-1 - start);
> len parameter of lookup_extent_mapping eventually ends up in range_end.
> Meaning it will just return -1. Why not use just -1 for len. Looking at
> the rest of the code this seems to be the convention. But then there are
> several places where 1 is passed as well. Hm, in any case a single
> number is simpler than an expression.

I still like to be accurate here, since it's @len, then we should follow
its naming.
Although we have range_end() for correct careless caller, it still
doesn't sound good just passing -1 as @len.

> 
>> +		read_unlock(&map_tree->map_tree.lock);
>> +		if (!em)
>> +			break;
>> +
>> +		bg = btrfs_lookup_block_group(fs_info, em->start);
>> +		if (!bg) {
>> +			btrfs_err_rl(fs_info,
>> +	"chunk start=%llu len=%llu doesn't have corresponding block group",
>> +				     em->start, em->len);
>> +			ret = -ENOENT;
>> +			free_extent_map(em);
>> +			break;
>> +		}
>> +		if (bg->key.objectid != em->start ||
>> +		    bg->key.offset != em->len ||
>> +		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
>> +		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
>> +			btrfs_err_rl(fs_info,
>> +"chunk start=%llu len=%llu flags=0x%llx doesn't match with block group start=%llu len=%llu flags=0x%llx",
>> +				em->start, em->len,
>> +				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
>> +				bg->key.objectid, bg->key.offset,
>> +				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
>> +			ret = -EUCLEAN;
>> +			free_extent_map(em);
>> +			btrfs_put_block_group(bg);
>> +			break;
>> +		}
>> +		start = em->start + em->len;
>> +		free_extent_map(em);
>> +		btrfs_put_block_group(bg);
>> +	}
>> +	return ret;
>> +}
>> +
>>  int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>  {
>>  	struct btrfs_path *path;
>> @@ -10227,7 +10277,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>  
>>  	btrfs_add_raid_kobjects(info);
>>  	init_global_block_rsv(info);
>> -	ret = 0;
>> +	ret = check_chunk_block_group_mappings(info);
> 
> Rather than doing that can we just get the count of chunks. Then if we
> have as many chunks as BG have been read in and we know the BG -> chunk
> mapping check has passed we can assume that chunks also map to BG
> without going through all chunks.

Nope, just as the checks done in that function, we must ensure not only
the number of bgs/chunks matches, but *each* chunk must have a block
group with the same size, length and type flags.

If we have a block group doesn't match its size/length, it's pretty
possible that the corrupted block group may overlap with other block
groups, causing undefined behavior.
So the same for type flags.

This means the only reliable check is the one used in this and previous
check.
(Check bg->chunk matches, and then check chunk->bg matches, using size +
len + type flags as material)

Thanks,
Qu

> 
>>  error:
>>  	btrfs_free_path(path);
>>  	return ret;
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba July 5, 2018, 3:18 p.m. UTC | #4
On Tue, Jul 03, 2018 at 05:10:09PM +0800, Qu Wenruo wrote:
> If a crafted btrfs has missing block group items, it could cause
> unexpected behavior and breaks our expectation on 1:1
> chunk<->block group mapping.
> 
> Although we added block group -> chunk mapping check, we still need
> chunk -> block group mapping check.
> 
> This patch will do extra check to ensure each chunk has its
> corresponding block group.
> 
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=199847
> Reported-by: Xu Wen <wen.xu@gatech.edu>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/extent-tree.c | 52 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 51 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 82b446f014b9..746095034ca2 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -10038,6 +10038,56 @@ static int check_exist_chunk(struct btrfs_fs_info *fs_info, u64 start, u64 len,
>  	return ret;
>  }
>  
> +/*
> + * Iterate all chunks and verify each of them has corresponding block group
> + */
> +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
> +	struct extent_map *em;
> +	struct btrfs_block_group_cache *bg;
> +	u64 start = 0;
> +	int ret = 0;
> +
> +	while (1) {
> +		read_lock(&map_tree->map_tree.lock);
> +		em = lookup_extent_mapping(&map_tree->map_tree, start,
> +					   (u64)-1 - start);

This needs a comment.

> +		read_unlock(&map_tree->map_tree.lock);
> +		if (!em)
> +			break;
> +
> +		bg = btrfs_lookup_block_group(fs_info, em->start);
> +		if (!bg) {
> +			btrfs_err_rl(fs_info,
> +	"chunk start=%llu len=%llu doesn't have corresponding block group",
> +				     em->start, em->len);
> +			ret = -ENOENT;

-EUCLEAN ?

> +			free_extent_map(em);
> +			break;
> +		}
> +		if (bg->key.objectid != em->start ||
> +		    bg->key.offset != em->len ||
> +		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
> +		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
> +			btrfs_err_rl(fs_info,

Why is this ratelmited? I'd understand that a fuzzed image will spew a
lot of these errors but for a normal case, it should be ok to print all
the messages.

> +"chunk start=%llu len=%llu flags=0x%llx doesn't match with block group start=%llu len=%llu flags=0x%llx",
> +				em->start, em->len,
> +				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
> +				bg->key.objectid, bg->key.offset,
> +				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
> +			ret = -EUCLEAN;
> +			free_extent_map(em);
> +			btrfs_put_block_group(bg);
> +			break;
> +		}
> +		start = em->start + em->len;
> +		free_extent_map(em);
> +		btrfs_put_block_group(bg);
> +	}
> +	return ret;
> +}
> +
>  int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  {
>  	struct btrfs_path *path;
> @@ -10227,7 +10277,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  
>  	btrfs_add_raid_kobjects(info);
>  	init_global_block_rsv(info);
> -	ret = 0;
> +	ret = check_chunk_block_group_mappings(info);
>  error:
>  	btrfs_free_path(path);
>  	return ret;
> -- 
> 2.18.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo July 5, 2018, 11:44 p.m. UTC | #5
On 2018年07月05日 23:18, David Sterba wrote:
> On Tue, Jul 03, 2018 at 05:10:09PM +0800, Qu Wenruo wrote:
>> If a crafted btrfs has missing block group items, it could cause
>> unexpected behavior and breaks our expectation on 1:1
>> chunk<->block group mapping.
>>
>> Although we added block group -> chunk mapping check, we still need
>> chunk -> block group mapping check.
>>
>> This patch will do extra check to ensure each chunk has its
>> corresponding block group.
>>
>> Link: https://bugzilla.kernel.org/show_bug.cgi?id=199847
>> Reported-by: Xu Wen <wen.xu@gatech.edu>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>  fs/btrfs/extent-tree.c | 52 +++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 51 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index 82b446f014b9..746095034ca2 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -10038,6 +10038,56 @@ static int check_exist_chunk(struct btrfs_fs_info *fs_info, u64 start, u64 len,
>>  	return ret;
>>  }
>>  
>> +/*
>> + * Iterate all chunks and verify each of them has corresponding block group
>> + */
>> +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
>> +{
>> +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
>> +	struct extent_map *em;
>> +	struct btrfs_block_group_cache *bg;
>> +	u64 start = 0;
>> +	int ret = 0;
>> +
>> +	while (1) {
>> +		read_lock(&map_tree->map_tree.lock);
>> +		em = lookup_extent_mapping(&map_tree->map_tree, start,
>> +					   (u64)-1 - start);
> 
> This needs a comment.

For the @len part?

> 
>> +		read_unlock(&map_tree->map_tree.lock);
>> +		if (!em)
>> +			break;
>> +
>> +		bg = btrfs_lookup_block_group(fs_info, em->start);
>> +		if (!bg) {
>> +			btrfs_err_rl(fs_info,
>> +	"chunk start=%llu len=%llu doesn't have corresponding block group",
>> +				     em->start, em->len);
>> +			ret = -ENOENT;
> 
> -EUCLEAN ?

Either works for me.

> 
>> +			free_extent_map(em);
>> +			break;
>> +		}
>> +		if (bg->key.objectid != em->start ||
>> +		    bg->key.offset != em->len ||
>> +		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
>> +		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
>> +			btrfs_err_rl(fs_info,
> 
> Why is this ratelmited? I'd understand that a fuzzed image will spew a
> lot of these errors but for a normal case, it should be ok to print all
> the messages.

Well, even for fuzzed image it won't trigger twice, the first time it
triggers we will error our, so indeed no need to rate the limit anyway.

Thanks,
Qu

> 
>> +"chunk start=%llu len=%llu flags=0x%llx doesn't match with block group start=%llu len=%llu flags=0x%llx",
>> +				em->start, em->len,
>> +				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
>> +				bg->key.objectid, bg->key.offset,
>> +				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
>> +			ret = -EUCLEAN;
>> +			free_extent_map(em);
>> +			btrfs_put_block_group(bg);
>> +			break;
>> +		}
>> +		start = em->start + em->len;
>> +		free_extent_map(em);
>> +		btrfs_put_block_group(bg);
>> +	}
>> +	return ret;
>> +}
>> +
>>  int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>  {
>>  	struct btrfs_path *path;
>> @@ -10227,7 +10277,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>  
>>  	btrfs_add_raid_kobjects(info);
>>  	init_global_block_rsv(info);
>> -	ret = 0;
>> +	ret = check_chunk_block_group_mappings(info);
>>  error:
>>  	btrfs_free_path(path);
>>  	return ret;
>> -- 
>> 2.18.0
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo July 5, 2018, 11:49 p.m. UTC | #6
On 2018年07月04日 17:46, Qu Wenruo wrote:
> 
> 
> On 2018年07月04日 15:08, Nikolay Borisov wrote:
>>
>>
>> On  3.07.2018 12:10, Qu Wenruo wrote:
>>> If a crafted btrfs has missing block group items, it could cause
>>> unexpected behavior and breaks our expectation on 1:1
>>> chunk<->block group mapping.
>>>
>>> Although we added block group -> chunk mapping check, we still need
>>> chunk -> block group mapping check.
>>>
>>> This patch will do extra check to ensure each chunk has its
>>> corresponding block group.
>>>
>>> Link: https://bugzilla.kernel.org/show_bug.cgi?id=199847
>>> Reported-by: Xu Wen <wen.xu@gatech.edu>
>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>> ---
>>>  fs/btrfs/extent-tree.c | 52 +++++++++++++++++++++++++++++++++++++++++-
>>>  1 file changed, 51 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>>> index 82b446f014b9..746095034ca2 100644
>>> --- a/fs/btrfs/extent-tree.c
>>> +++ b/fs/btrfs/extent-tree.c
>>> @@ -10038,6 +10038,56 @@ static int check_exist_chunk(struct btrfs_fs_info *fs_info, u64 start, u64 len,
>>>  	return ret;
>>>  }
>>>  
>>> +/*
>>> + * Iterate all chunks and verify each of them has corresponding block group
>>> + */
>>> +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
>>> +{
>>> +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
>>> +	struct extent_map *em;
>>> +	struct btrfs_block_group_cache *bg;
>>> +	u64 start = 0;
>>> +	int ret = 0;
>>> +
>>> +	while (1) {
>>> +		read_lock(&map_tree->map_tree.lock);
>>> +		em = lookup_extent_mapping(&map_tree->map_tree, start,
>>> +					   (u64)-1 - start);
>> len parameter of lookup_extent_mapping eventually ends up in range_end.
>> Meaning it will just return -1. Why not use just -1 for len. Looking at
>> the rest of the code this seems to be the convention. But then there are
>> several places where 1 is passed as well. Hm, in any case a single
>> number is simpler than an expression.
> 
> I still like to be accurate here, since it's @len, then we should follow
> its naming.
> Although we have range_end() for correct careless caller, it still
> doesn't sound good just passing -1 as @len.
> 
>>
>>> +		read_unlock(&map_tree->map_tree.lock);
>>> +		if (!em)
>>> +			break;
>>> +
>>> +		bg = btrfs_lookup_block_group(fs_info, em->start);
>>> +		if (!bg) {
>>> +			btrfs_err_rl(fs_info,
>>> +	"chunk start=%llu len=%llu doesn't have corresponding block group",
>>> +				     em->start, em->len);
>>> +			ret = -ENOENT;
>>> +			free_extent_map(em);
>>> +			break;
>>> +		}
>>> +		if (bg->key.objectid != em->start ||
>>> +		    bg->key.offset != em->len ||
>>> +		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
>>> +		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
>>> +			btrfs_err_rl(fs_info,
>>> +"chunk start=%llu len=%llu flags=0x%llx doesn't match with block group start=%llu len=%llu flags=0x%llx",
>>> +				em->start, em->len,
>>> +				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
>>> +				bg->key.objectid, bg->key.offset,
>>> +				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
>>> +			ret = -EUCLEAN;
>>> +			free_extent_map(em);
>>> +			btrfs_put_block_group(bg);
>>> +			break;
>>> +		}
>>> +		start = em->start + em->len;
>>> +		free_extent_map(em);
>>> +		btrfs_put_block_group(bg);
>>> +	}
>>> +	return ret;
>>> +}
>>> +
>>>  int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>>  {
>>>  	struct btrfs_path *path;
>>> @@ -10227,7 +10277,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>>  
>>>  	btrfs_add_raid_kobjects(info);
>>>  	init_global_block_rsv(info);
>>> -	ret = 0;
>>> +	ret = check_chunk_block_group_mappings(info);
>>
>> Rather than doing that can we just get the count of chunks. Then if we
>> have as many chunks as BG have been read in and we know the BG -> chunk
>> mapping check has passed we can assume that chunks also map to BG
>> without going through all chunks.
> 
> Nope, just as the checks done in that function, we must ensure not only
> the number of bgs/chunks matches, but *each* chunk must have a block
> group with the same size, length and type flags.

Thanks to Gu's comment, there in find_first_block() we have already done
extra check to ensure every block group we're adding has a corresponding
chunk, thus just doing chunk/bg counting should be able to detect
missing block groups.

I'll try this method to reduce unnecessary block group lookup in next
version.

Thanks,
Qu

> 
> If we have a block group doesn't match its size/length, it's pretty
> possible that the corrupted block group may overlap with other block
> groups, causing undefined behavior.
> So the same for type flags.
> 
> This means the only reliable check is the one used in this and previous
> check.
> (Check bg->chunk matches, and then check chunk->bg matches, using size +
> len + type flags as material)
> 
> Thanks,
> Qu
> 
>>
>>>  error:
>>>  	btrfs_free_path(path);
>>>  	return ret;
>>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba July 16, 2018, 1:16 p.m. UTC | #7
On Fri, Jul 06, 2018 at 07:44:37AM +0800, Qu Wenruo wrote:
> 
> 
> On 2018年07月05日 23:18, David Sterba wrote:
> > On Tue, Jul 03, 2018 at 05:10:09PM +0800, Qu Wenruo wrote:
> >> If a crafted btrfs has missing block group items, it could cause
> >> unexpected behavior and breaks our expectation on 1:1
> >> chunk<->block group mapping.
> >>
> >> Although we added block group -> chunk mapping check, we still need
> >> chunk -> block group mapping check.
> >>
> >> This patch will do extra check to ensure each chunk has its
> >> corresponding block group.
> >>
> >> Link: https://bugzilla.kernel.org/show_bug.cgi?id=199847
> >> Reported-by: Xu Wen <wen.xu@gatech.edu>
> >> Signed-off-by: Qu Wenruo <wqu@suse.com>
> >> ---
> >>  fs/btrfs/extent-tree.c | 52 +++++++++++++++++++++++++++++++++++++++++-
> >>  1 file changed, 51 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> >> index 82b446f014b9..746095034ca2 100644
> >> --- a/fs/btrfs/extent-tree.c
> >> +++ b/fs/btrfs/extent-tree.c
> >> @@ -10038,6 +10038,56 @@ static int check_exist_chunk(struct btrfs_fs_info *fs_info, u64 start, u64 len,
> >>  	return ret;
> >>  }
> >>  
> >> +/*
> >> + * Iterate all chunks and verify each of them has corresponding block group
> >> + */
> >> +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
> >> +{
> >> +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
> >> +	struct extent_map *em;
> >> +	struct btrfs_block_group_cache *bg;
> >> +	u64 start = 0;
> >> +	int ret = 0;
> >> +
> >> +	while (1) {
> >> +		read_lock(&map_tree->map_tree.lock);
> >> +		em = lookup_extent_mapping(&map_tree->map_tree, start,
> >> +					   (u64)-1 - start);
> > 
> > This needs a comment.
> 
> For the @len part?

Yes, for the expression how it's calculated.

> > 
> >> +		read_unlock(&map_tree->map_tree.lock);
> >> +		if (!em)
> >> +			break;
> >> +
> >> +		bg = btrfs_lookup_block_group(fs_info, em->start);
> >> +		if (!bg) {
> >> +			btrfs_err_rl(fs_info,
> >> +	"chunk start=%llu len=%llu doesn't have corresponding block group",
> >> +				     em->start, em->len);
> >> +			ret = -ENOENT;
> > 
> > -EUCLEAN ?
> 
> Either works for me.

That's not just a cosmetic change, there's a semantic difference between
the error codes, I maybe make that more explicit and not expect that this
is obvious.

ENOENT does not make much sense in this context, the caller (mount in
this case) cannot do anything about a code that says 'some internal
structure not found'.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo July 16, 2018, 1:57 p.m. UTC | #8
On 2018年07月16日 21:16, David Sterba wrote:
> On Fri, Jul 06, 2018 at 07:44:37AM +0800, Qu Wenruo wrote:
>>
>>
>> On 2018年07月05日 23:18, David Sterba wrote:
>>> On Tue, Jul 03, 2018 at 05:10:09PM +0800, Qu Wenruo wrote:
>>>> If a crafted btrfs has missing block group items, it could cause
>>>> unexpected behavior and breaks our expectation on 1:1
>>>> chunk<->block group mapping.
>>>>
>>>> Although we added block group -> chunk mapping check, we still need
>>>> chunk -> block group mapping check.
>>>>
>>>> This patch will do extra check to ensure each chunk has its
>>>> corresponding block group.
>>>>
>>>> Link: https://bugzilla.kernel.org/show_bug.cgi?id=199847
>>>> Reported-by: Xu Wen <wen.xu@gatech.edu>
>>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>>> ---
>>>>  fs/btrfs/extent-tree.c | 52 +++++++++++++++++++++++++++++++++++++++++-
>>>>  1 file changed, 51 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>>>> index 82b446f014b9..746095034ca2 100644
>>>> --- a/fs/btrfs/extent-tree.c
>>>> +++ b/fs/btrfs/extent-tree.c
>>>> @@ -10038,6 +10038,56 @@ static int check_exist_chunk(struct btrfs_fs_info *fs_info, u64 start, u64 len,
>>>>  	return ret;
>>>>  }
>>>>  
>>>> +/*
>>>> + * Iterate all chunks and verify each of them has corresponding block group
>>>> + */
>>>> +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
>>>> +{
>>>> +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
>>>> +	struct extent_map *em;
>>>> +	struct btrfs_block_group_cache *bg;
>>>> +	u64 start = 0;
>>>> +	int ret = 0;
>>>> +
>>>> +	while (1) {
>>>> +		read_lock(&map_tree->map_tree.lock);
>>>> +		em = lookup_extent_mapping(&map_tree->map_tree, start,
>>>> +					   (u64)-1 - start);
>>>
>>> This needs a comment.
>>
>> For the @len part?
> 
> Yes, for the expression how it's calculated.
> 
>>>
>>>> +		read_unlock(&map_tree->map_tree.lock);
>>>> +		if (!em)
>>>> +			break;
>>>> +
>>>> +		bg = btrfs_lookup_block_group(fs_info, em->start);
>>>> +		if (!bg) {
>>>> +			btrfs_err_rl(fs_info,
>>>> +	"chunk start=%llu len=%llu doesn't have corresponding block group",
>>>> +				     em->start, em->len);
>>>> +			ret = -ENOENT;
>>>
>>> -EUCLEAN ?
>>
>> Either works for me.
> 
> That's not just a cosmetic change, there's a semantic difference between
> the error codes, I maybe make that more explicit and not expect that this
> is obvious.
> 
> ENOENT does not make much sense in this context, the caller (mount in
> this case) cannot do anything about a code that says 'some internal
> structure not found'.

The point here is, if every self-checker should only return -EUCLEAN, it
won't really indicate what's going wrong, except points to some
self-checker (and such self-checkers are growing larger than our
expectation already).

My practice here is, put some human readable and meaningful error
message. No matter what we choose to return, the error message should
tell us what's going wrong.

In this case, I don't really care the return value. If it's explicitly
needed to return -EUCLEAN, I could make all existing checker (from
tree-checker to chunk/bg/dev-extent checker) to return -EUCLEAN if
anything is wrong (and save several "ret = -EUCLEAN" lines).
The return value doesn't really have much meaning nowadays, it's the
error message important now.

Thanks,
Qu

> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
David Sterba July 17, 2018, 12:33 p.m. UTC | #9
On Mon, Jul 16, 2018 at 09:57:43PM +0800, Qu Wenruo wrote:
> >>> -EUCLEAN ?
> >>
> >> Either works for me.
> > 
> > That's not just a cosmetic change, there's a semantic difference between
> > the error codes, I maybe make that more explicit and not expect that this
> > is obvious.
> > 
> > ENOENT does not make much sense in this context, the caller (mount in
> > this case) cannot do anything about a code that says 'some internal
> > structure not found'.
> 
> The point here is, if every self-checker should only return -EUCLEAN, it
> won't really indicate what's going wrong, except points to some
> self-checker (and such self-checkers are growing larger than our
> expectation already).
> 
> My practice here is, put some human readable and meaningful error
> message. No matter what we choose to return, the error message should
> tell us what's going wrong.
> 
> In this case, I don't really care the return value. If it's explicitly
> needed to return -EUCLEAN, I could make all existing checker (from
> tree-checker to chunk/bg/dev-extent checker) to return -EUCLEAN if
> anything is wrong (and save several "ret = -EUCLEAN" lines).
> The return value doesn't really have much meaning nowadays, it's the
> error message important now.

Ok, I see what you mean. The message is important as it's otherwise
almost impossible to find where exactly the mount fails.

The error messages perhaps fall into several categories:

1) transient errors, some failure that happens before the filesystem state
   is fully examined

this is namely ENOMEM, or EINTR eg. returned by kthread_run

maybe also a failure on a multi-device filesystem when the devices
haven't been scanned yet

2) clearly some corruption/consistency condtion, with enough information
   available to decide

like a missing tree, most of the tree-checker would fall into this
category

3) same as the previous one, but thre's some extenal condition preventing
   a full check

that's eg. a real EIO after reading a tree block


The error code are IMO important to see how severe the problems are and
what's the expected solution. 2 is for 'check', 3 may need degraded
mount, 1 needs maybe more time to mount again.

With the error messages in place, 2 can be completely covered by
EUCLEAN. I briefly skimmed a few call paths and think that the 3
categories should be enough, but I'm also expecting some exceptions that
can be decided case by case.

The error codes are now not consistent, lots of EUCLEAN are historically
EIO, but before we start cleaning that up we should have at least some
guidelines. Please let me know what you think.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo July 17, 2018, 1:32 p.m. UTC | #10
On 2018年07月17日 20:33, David Sterba wrote:
> On Mon, Jul 16, 2018 at 09:57:43PM +0800, Qu Wenruo wrote:
>>>>> -EUCLEAN ?
>>>>
>>>> Either works for me.
>>>
>>> That's not just a cosmetic change, there's a semantic difference between
>>> the error codes, I maybe make that more explicit and not expect that this
>>> is obvious.
>>>
>>> ENOENT does not make much sense in this context, the caller (mount in
>>> this case) cannot do anything about a code that says 'some internal
>>> structure not found'.
>>
>> The point here is, if every self-checker should only return -EUCLEAN, it
>> won't really indicate what's going wrong, except points to some
>> self-checker (and such self-checkers are growing larger than our
>> expectation already).
>>
>> My practice here is, put some human readable and meaningful error
>> message. No matter what we choose to return, the error message should
>> tell us what's going wrong.
>>
>> In this case, I don't really care the return value. If it's explicitly
>> needed to return -EUCLEAN, I could make all existing checker (from
>> tree-checker to chunk/bg/dev-extent checker) to return -EUCLEAN if
>> anything is wrong (and save several "ret = -EUCLEAN" lines).
>> The return value doesn't really have much meaning nowadays, it's the
>> error message important now.
> 
> Ok, I see what you mean. The message is important as it's otherwise
> almost impossible to find where exactly the mount fails.
> 
> The error messages perhaps fall into several categories:
> 
> 1) transient errors, some failure that happens before the filesystem state
>    is fully examined
> 
> this is namely ENOMEM, or EINTR eg. returned by kthread_run

This standard is a little misleading, or did I misunderstand your category?

From the example error number, I could only find ENOMEM so
straightforward for end user/developer that we don't need any error
message to explain them.
Or this category is just for error no need of error message? (or can be
handled by btrfs-progs without any need of user interruption/decision?)

> 
> maybe also a failure on a multi-device filesystem when the devices
> haven't been scanned yet
> 
> 2) clearly some corruption/consistency condtion, with enough information
>    available to decide
> 
> like a missing tree, most of the tree-checker would fall into this
> category

This is pretty clear.

> 
> 3) same as the previous one, but there's some external condition preventing
>    a full check
> 
> that's eg. a real EIO after reading a tree block

That csum mismatch EIO with error message or really some error from
underlying layer like some ATA command failure?

> 
> 
> The error code are IMO important to see how severe the problems are and
> what's the expected solution. 2 is for 'check', 3 may need degraded
> mount, 1 needs maybe more time to mount again.

Category 2 for check is sure.
For other 2 cases it's a little hard to say.
Normally if we really hit some error we don't expect, under most case
the filesystem is already corrupted (e.g. a lot of errors of resuming
balance/mount failure finally turns out to be fs corruption).

If category is determined by the expected solution, most will just fall
into category 2), including most of errors we have in btrfs module
currently.

> 
> With the error messages in place, 2 can be completely covered by
> EUCLEAN. I briefly skimmed a few call paths and think that the 3
> categories should be enough, but I'm also expecting some exceptions that
> can be decided case by case.

For category 2), I think it's pretty clear and practically to use EUCLEAN.

For other categories I'm not really sure.

E.G what happens if we can't find certain backref when running delayed
refs? It's either a kernel bug or a corrupted fs.
Which category should it fit? Category 2? But we don't really know
what's going wrong.
For category 1/3? It won't really be fixed until we fix the bug or the fs.

More details examples would definitely help me understand the category.

> 
> The error codes are now not consistent, lots of EUCLEAN are historically
> EIO, but before we start cleaning that up we should have at least some
> guidelines. Please let me know what you think.
> 
At least for self-verification code it's pretty clear that we should
have error message for what's going wrong and what we expect, with
explicit EUCLEAN error number.

Thanks,
Qu
David Sterba July 19, 2018, 2:22 p.m. UTC | #11
On Tue, Jul 17, 2018 at 09:32:27PM +0800, Qu Wenruo wrote:
> On 2018年07月17日 20:33, David Sterba wrote:
> > On Mon, Jul 16, 2018 at 09:57:43PM +0800, Qu Wenruo wrote:
> >>>>> -EUCLEAN ?
> >>>>
> >>>> Either works for me.
> >>>
> >>> That's not just a cosmetic change, there's a semantic difference between
> >>> the error codes, I maybe make that more explicit and not expect that this
> >>> is obvious.
> >>>
> >>> ENOENT does not make much sense in this context, the caller (mount in
> >>> this case) cannot do anything about a code that says 'some internal
> >>> structure not found'.
> >>
> >> The point here is, if every self-checker should only return -EUCLEAN, it
> >> won't really indicate what's going wrong, except points to some
> >> self-checker (and such self-checkers are growing larger than our
> >> expectation already).
> >>
> >> My practice here is, put some human readable and meaningful error
> >> message. No matter what we choose to return, the error message should
> >> tell us what's going wrong.
> >>
> >> In this case, I don't really care the return value. If it's explicitly
> >> needed to return -EUCLEAN, I could make all existing checker (from
> >> tree-checker to chunk/bg/dev-extent checker) to return -EUCLEAN if
> >> anything is wrong (and save several "ret = -EUCLEAN" lines).
> >> The return value doesn't really have much meaning nowadays, it's the
> >> error message important now.
> > 
> > Ok, I see what you mean. The message is important as it's otherwise
> > almost impossible to find where exactly the mount fails.
> > 
> > The error messages perhaps fall into several categories:
> > 
> > 1) transient errors, some failure that happens before the filesystem state
> >    is fully examined
> > 
> > this is namely ENOMEM, or EINTR eg. returned by kthread_run
> 
> This standard is a little misleading, or did I misunderstand your category?
> 
> From the example error number, I could only find ENOMEM so
> straightforward for end user/developer that we don't need any error
> message to explain them.
> Or this category is just for error no need of error message? (or can be
> handled by btrfs-progs without any need of user interruption/decision?)

Yes, that's the point that there does not need to be any message. The
error code should be selfexplanatory.

> > maybe also a failure on a multi-device filesystem when the devices
> > haven't been scanned yet
> > 
> > 2) clearly some corruption/consistency condtion, with enough information
> >    available to decide
> > 
> > like a missing tree, most of the tree-checker would fall into this
> > category
> 
> This is pretty clear.
> 
> > 
> > 3) same as the previous one, but there's some external condition preventing
> >    a full check
> > 
> > that's eg. a real EIO after reading a tree block
> 
> That csum mismatch EIO with error message or really some error from
> underlying layer like some ATA command failure?

The idea for 3 is to cover hard errors, like the ATA error or anything
that block layer/driver returns. The checksum mismatch is a soft error
that still can be considered an EIO ("I can't give you the data"). So
there could be another category with the soft errors like csum
mismatches or generation mismatches etc.

> > The error code are IMO important to see how severe the problems are and
> > what's the expected solution. 2 is for 'check', 3 may need degraded
> > mount, 1 needs maybe more time to mount again.
> 
> Category 2 for check is sure.
> For other 2 cases it's a little hard to say.
> Normally if we really hit some error we don't expect, under most case
> the filesystem is already corrupted (e.g. a lot of errors of resuming
> balance/mount failure finally turns out to be fs corruption).
> 
> If category is determined by the expected solution, most will just fall
> into category 2), including most of errors we have in btrfs module
> currently.

Agreed that most things fall to 2 and we can't do much about it other
than try 'check' detect the scope of damage.

> > With the error messages in place, 2 can be completely covered by
> > EUCLEAN. I briefly skimmed a few call paths and think that the 3
> > categories should be enough, but I'm also expecting some exceptions that
> > can be decided case by case.
> 
> For category 2), I think it's pretty clear and practically to use EUCLEAN.
> 
> For other categories I'm not really sure.
> 
> E.G what happens if we can't find certain backref when running delayed
> refs? It's either a kernel bug or a corrupted fs.
> Which category should it fit? Category 2? But we don't really know
> what's going wrong.

If some critical piece of data is missing, like the backref, then it's a
structural bug and it's for 2. The reason why it happened we may not
known at the moment, but it is still a detected corruption. If it turns
out it's a bug, then it'll get fixed and the corruption will not be
detected for the 'bug' reason, but can be for a corruption or memory
bitflip error.

> For category 1/3? It won't really be fixed until we fix the bug or the fs.
> 
> More details examples would definitely help me understand the category.
> 
> > The error codes are now not consistent, lots of EUCLEAN are historically
> > EIO, but before we start cleaning that up we should have at least some
> > guidelines. Please let me know what you think.
> > 
> At least for self-verification code it's pretty clear that we should
> have error message for what's going wrong and what we expect, with
> explicit EUCLEAN error number.

Agreed.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 82b446f014b9..746095034ca2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10038,6 +10038,56 @@  static int check_exist_chunk(struct btrfs_fs_info *fs_info, u64 start, u64 len,
 	return ret;
 }
 
+/*
+ * Iterate all chunks and verify each of them has corresponding block group
+ */
+static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct extent_map *em;
+	struct btrfs_block_group_cache *bg;
+	u64 start = 0;
+	int ret = 0;
+
+	while (1) {
+		read_lock(&map_tree->map_tree.lock);
+		em = lookup_extent_mapping(&map_tree->map_tree, start,
+					   (u64)-1 - start);
+		read_unlock(&map_tree->map_tree.lock);
+		if (!em)
+			break;
+
+		bg = btrfs_lookup_block_group(fs_info, em->start);
+		if (!bg) {
+			btrfs_err_rl(fs_info,
+	"chunk start=%llu len=%llu doesn't have corresponding block group",
+				     em->start, em->len);
+			ret = -ENOENT;
+			free_extent_map(em);
+			break;
+		}
+		if (bg->key.objectid != em->start ||
+		    bg->key.offset != em->len ||
+		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+			btrfs_err_rl(fs_info,
+"chunk start=%llu len=%llu flags=0x%llx doesn't match with block group start=%llu len=%llu flags=0x%llx",
+				em->start, em->len,
+				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+				bg->key.objectid, bg->key.offset,
+				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+			ret = -EUCLEAN;
+			free_extent_map(em);
+			btrfs_put_block_group(bg);
+			break;
+		}
+		start = em->start + em->len;
+		free_extent_map(em);
+		btrfs_put_block_group(bg);
+	}
+	return ret;
+}
+
 int btrfs_read_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_path *path;
@@ -10227,7 +10277,7 @@  int btrfs_read_block_groups(struct btrfs_fs_info *info)
 
 	btrfs_add_raid_kobjects(info);
 	init_global_block_rsv(info);
-	ret = 0;
+	ret = check_chunk_block_group_mappings(info);
 error:
 	btrfs_free_path(path);
 	return ret;