
[RFC] btrfs: Speedup btrfs_read_block_groups()

Message ID 20180222045215.16426-1-wqu@suse.com (mailing list archive)
State New, archived

Commit Message

Qu Wenruo Feb. 22, 2018, 4:52 a.m. UTC
btrfs_read_block_groups() is used to build up the block group cache for
all block groups, so it will iterate over all block group items in the
extent tree.

For a large filesystem (TB level), it will search for BLOCK_GROUP_ITEM
thousands of times, which is the most time-consuming part of mounting
btrfs.

So this patch will try to speed it up by:

1) Avoid unnecessary readahead
   We were using READA_FORWARD to search for block group items.
   However, block group items are in fact scattered across quite a lot
   of leaves, so doing readahead just wastes IO (especially painful on
   HDD).

   In a real-world case, a filesystem with 3T of used space has about
   50K extent tree leaves but only 3K block group items, meaning we
   need to iterate about 16 leaves to find one block group item on
   average.

   So readahead won't help; it only wastes slow HDD seeks.

2) Use chunk mapping to locate block group items
   Since one block group item always has exactly one corresponding
   chunk item, we can use the chunk mapping to get the block group
   item's size, and thus its exact key.

   With the exact key, we can do a pinpoint tree search instead of
   searching with some uncertain value and scanning forward.

   In some cases, e.g. when the next BLOCK_GROUP_ITEM is in the leaf
   after the current path, this saves such unnecessary tree block
   reads.

Cc: Ellis H. Wilson III <ellisw@panasas.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
Since all my TB-level storage is occupied by my NAS, any feedback
(especially on real-world mount speed changes) is welcome.
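
For reviewers, here is a condensed sketch of the new lookup loop
(error handling and the existing block group setup code trimmed; the
full diff below is authoritative):

	while (1) {
		struct extent_map *em;

		/* Chunk mappings are already loaded in memory at this
		 * point, so this lookup needs no extra disk IO. */
		read_lock(&map_tree->map_tree.lock);
		em = lookup_extent_mapping(&map_tree->map_tree, cur,
					   ((u64)-1) - cur);
		read_unlock(&map_tree->map_tree.lock);
		if (!em)
			break;	/* no more chunks, thus no more block groups */

		/* Each chunk has exactly one block group item, keyed
		 * (chunk start, BLOCK_GROUP_ITEM, chunk len), so the exact
		 * key is known and one pinpoint search is enough. */
		key.objectid = em->start;
		key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
		key.offset = em->len;
		cur = em->start + em->len;
		free_extent_map(em);

		ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
		/* ret > 0: the chunk has no block group item -> corruption */
		...
	}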
---
 fs/btrfs/extent-tree.c | 88 +++++++++++++++-----------------------------------
 1 file changed, 26 insertions(+), 62 deletions(-)

Comments

Qu Wenruo Feb. 22, 2018, 4:56 a.m. UTC | #1
On 2018-02-22 12:52, Qu Wenruo wrote:
> btrfs_read_block_groups() is used to build up the block group cache for
> all block groups, so it will iterate all block group items in extent
> tree.
> 
> For large filesystem (TB level), it will search for BLOCK_GROUP_ITEM
> thousands times, which is the most time consuming part of mounting
> btrfs.
> 
> So this patch will try to speed it up by:
> 
> 1) Avoid unnecessary readahead
>    We were using READA_FORWARD to search for block group item.
>    However block group items are in fact scattered across quite a lot of
>    leaves. Doing readahead will just waste our IO (especially important
>    for HDD).
> 
>    In real world case, for a filesystem with 3T used space, it would
>    have about 50K extent tree leaves, but only have 3K block group
>    items. Meaning we need to iterate 16 leaves to meet one block group
>    on average.
> 
>    So readahead won't help but waste slow HDD seeks.
> 
> 2) Use chunk mapping to locate block group items
>    Since one block group item always has one corresponding chunk item,
>    we could use chunk mapping to get the block group item size.
> 
>    With block group item size, we can do a pinpoint tree search, instead
>    of searching with some uncertain value and do forward search.
> 
>    In some case, like next BLOCK_GROUP_ITEM is in the next leaf of
>    current path, we could save such unnecessary tree block read.
> 
> Cc: Ellis H. Wilson III <ellisw@panasas.com>

Hi Ellis,

Would you please try this patch to see if it helps to speed up the mount
of your large filesystem?

Thanks,
Qu

> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> Since all my TB level storage is all occupied by my NAS, any feedback
> (especially for the real world mount speed change) is welcome.
> ---
>  fs/btrfs/extent-tree.c | 88 +++++++++++++++-----------------------------------
>  1 file changed, 26 insertions(+), 62 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 2f4328511ac8..a3faa0cbe056 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -9713,60 +9713,6 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
>  	return ret;
>  }
>  
> -static int find_first_block_group(struct btrfs_fs_info *fs_info,
> -				  struct btrfs_path *path,
> -				  struct btrfs_key *key)
> -{
> -	struct btrfs_root *root = fs_info->extent_root;
> -	int ret = 0;
> -	struct btrfs_key found_key;
> -	struct extent_buffer *leaf;
> -	int slot;
> -
> -	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
> -	if (ret < 0)
> -		goto out;
> -
> -	while (1) {
> -		slot = path->slots[0];
> -		leaf = path->nodes[0];
> -		if (slot >= btrfs_header_nritems(leaf)) {
> -			ret = btrfs_next_leaf(root, path);
> -			if (ret == 0)
> -				continue;
> -			if (ret < 0)
> -				goto out;
> -			break;
> -		}
> -		btrfs_item_key_to_cpu(leaf, &found_key, slot);
> -
> -		if (found_key.objectid >= key->objectid &&
> -		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
> -			struct extent_map_tree *em_tree;
> -			struct extent_map *em;
> -
> -			em_tree = &root->fs_info->mapping_tree.map_tree;
> -			read_lock(&em_tree->lock);
> -			em = lookup_extent_mapping(em_tree, found_key.objectid,
> -						   found_key.offset);
> -			read_unlock(&em_tree->lock);
> -			if (!em) {
> -				btrfs_err(fs_info,
> -			"logical %llu len %llu found bg but no related chunk",
> -					  found_key.objectid, found_key.offset);
> -				ret = -ENOENT;
> -			} else {
> -				ret = 0;
> -			}
> -			free_extent_map(em);
> -			goto out;
> -		}
> -		path->slots[0]++;
> -	}
> -out:
> -	return ret;
> -}
> -
>  void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
>  {
>  	struct btrfs_block_group_cache *block_group;
> @@ -9988,12 +9934,15 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  {
>  	struct btrfs_path *path;
>  	int ret;
> +	struct btrfs_mapping_tree *map_tree = &info->mapping_tree;
> +	struct btrfs_root *extent_root = info->extent_root;
>  	struct btrfs_block_group_cache *cache;
>  	struct btrfs_space_info *space_info;
>  	struct btrfs_key key;
>  	struct btrfs_key found_key;
>  	struct extent_buffer *leaf;
>  	int need_clear = 0;
> +	u64 cur = 0;
>  	u64 cache_gen;
>  	u64 feature;
>  	int mixed;
> @@ -10001,13 +9950,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  	feature = btrfs_super_incompat_flags(info->super_copy);
>  	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
>  
> -	key.objectid = 0;
> -	key.offset = 0;
> -	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
>  	path = btrfs_alloc_path();
>  	if (!path)
>  		return -ENOMEM;
> -	path->reada = READA_FORWARD;
>  
>  	cache_gen = btrfs_super_cache_generation(info->super_copy);
>  	if (btrfs_test_opt(info, SPACE_CACHE) &&
> @@ -10017,10 +9962,30 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  		need_clear = 1;
>  
>  	while (1) {
> -		ret = find_first_block_group(info, path, &key);
> -		if (ret > 0)
> +		struct extent_map *em;
> +
> +		read_lock(&map_tree->map_tree.lock);
> +		em = lookup_extent_mapping(&map_tree->map_tree, cur,
> +					   ((u64)-1) - cur);
> +		read_unlock(&map_tree->map_tree.lock);
> +		if (!em)
>  			break;
> -		if (ret != 0)
> +
> +		key.objectid = em->start;
> +		key.offset = em->len;
> +		key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
> +		cur = em->start + em->len;
> +		free_extent_map(em);
> +
> +		ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
> +		if (ret > 0) {
> +			WARN(1, KERN_ERR
> +			"chunk [%llu %llu) doesn't has its block group item\n",
> +			     key.objectid, key.objectid + key.offset);
> +			ret = -ENOENT;
> +			goto error;
> +		}
> +		if (ret < 0)
>  			goto error;
>  
>  		leaf = path->nodes[0];
> @@ -10062,7 +10027,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  			goto error;
>  		}
>  
> -		key.objectid = found_key.objectid + found_key.offset;
>  		btrfs_release_path(path);
>  
>  		/*
>
Nikolay Borisov Feb. 22, 2018, 8:36 a.m. UTC | #2
On 22.02.2018 06:52, Qu Wenruo wrote:
> btrfs_read_block_groups() is used to build up the block group cache for
> all block groups, so it will iterate all block group items in extent
> tree.
> 
> For large filesystem (TB level), it will search for BLOCK_GROUP_ITEM
> thousands times, which is the most time consuming part of mounting
> btrfs.
> 
> So this patch will try to speed it up by:
> 
> 1) Avoid unnecessary readahead
>    We were using READA_FORWARD to search for block group item.
>    However block group items are in fact scattered across quite a lot of
>    leaves. Doing readahead will just waste our IO (especially important
>    for HDD).
> 
>    In real world case, for a filesystem with 3T used space, it would
>    have about 50K extent tree leaves, but only have 3K block group
>    items. Meaning we need to iterate 16 leaves to meet one block group
>    on average.
> 
>    So readahead won't help but waste slow HDD seeks.
> 
> 2) Use chunk mapping to locate block group items
>    Since one block group item always has one corresponding chunk item,
>    we could use chunk mapping to get the block group item size.
> 
>    With block group item size, we can do a pinpoint tree search, instead
>    of searching with some uncertain value and do forward search.
> 
>    In some case, like next BLOCK_GROUP_ITEM is in the next leaf of
>    current path, we could save such unnecessary tree block read.
> 
> Cc: Ellis H. Wilson III <ellisw@panasas.com>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> Since all my TB level storage is all occupied by my NAS, any feedback
> (especially for the real world mount speed change) is welcome.
> ---
>  fs/btrfs/extent-tree.c | 88 +++++++++++++++-----------------------------------
>  1 file changed, 26 insertions(+), 62 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 2f4328511ac8..a3faa0cbe056 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -9713,60 +9713,6 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
>  	return ret;
>  }
>  
> -static int find_first_block_group(struct btrfs_fs_info *fs_info,
> -				  struct btrfs_path *path,
> -				  struct btrfs_key *key)
> -{
> -	struct btrfs_root *root = fs_info->extent_root;
> -	int ret = 0;
> -	struct btrfs_key found_key;
> -	struct extent_buffer *leaf;
> -	int slot;
> -
> -	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
> -	if (ret < 0)
> -		goto out;
> -
> -	while (1) {
> -		slot = path->slots[0];
> -		leaf = path->nodes[0];
> -		if (slot >= btrfs_header_nritems(leaf)) {
> -			ret = btrfs_next_leaf(root, path);
> -			if (ret == 0)
> -				continue;
> -			if (ret < 0)
> -				goto out;
> -			break;
> -		}
> -		btrfs_item_key_to_cpu(leaf, &found_key, slot);
> -
> -		if (found_key.objectid >= key->objectid &&
> -		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
> -			struct extent_map_tree *em_tree;
> -			struct extent_map *em;
> -
> -			em_tree = &root->fs_info->mapping_tree.map_tree;
> -			read_lock(&em_tree->lock);
> -			em = lookup_extent_mapping(em_tree, found_key.objectid,
> -						   found_key.offset);
> -			read_unlock(&em_tree->lock);
> -			if (!em) {
> -				btrfs_err(fs_info,
> -			"logical %llu len %llu found bg but no related chunk",
> -					  found_key.objectid, found_key.offset);
> -				ret = -ENOENT;
> -			} else {
> -				ret = 0;
> -			}
> -			free_extent_map(em);
> -			goto out;
> -		}
> -		path->slots[0]++;
> -	}
> -out:
> -	return ret;
> -}
> -
>  void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
>  {
>  	struct btrfs_block_group_cache *block_group;
> @@ -9988,12 +9934,15 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  {
>  	struct btrfs_path *path;
>  	int ret;
> +	struct btrfs_mapping_tree *map_tree = &info->mapping_tree;
> +	struct btrfs_root *extent_root = info->extent_root;
>  	struct btrfs_block_group_cache *cache;
>  	struct btrfs_space_info *space_info;
>  	struct btrfs_key key;
>  	struct btrfs_key found_key;
>  	struct extent_buffer *leaf;
>  	int need_clear = 0;
> +	u64 cur = 0;
>  	u64 cache_gen;
>  	u64 feature;
>  	int mixed;
> @@ -10001,13 +9950,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  	feature = btrfs_super_incompat_flags(info->super_copy);
>  	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
>  
> -	key.objectid = 0;
> -	key.offset = 0;
> -	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
>  	path = btrfs_alloc_path();
>  	if (!path)
>  		return -ENOMEM;
> -	path->reada = READA_FORWARD;
>  
>  	cache_gen = btrfs_super_cache_generation(info->super_copy);
>  	if (btrfs_test_opt(info, SPACE_CACHE) &&
> @@ -10017,10 +9962,30 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  		need_clear = 1;
>  
>  	while (1) {
> -		ret = find_first_block_group(info, path, &key);
> -		if (ret > 0)
> +		struct extent_map *em;
> +
> +		read_lock(&map_tree->map_tree.lock);
> +		em = lookup_extent_mapping(&map_tree->map_tree, cur,
> +					   ((u64)-1) - cur);
> +		read_unlock(&map_tree->map_tree.lock);
> +		if (!em)
>  			break;
> -		if (ret != 0)
> +
> +		key.objectid = em->start;
> +		key.offset = em->len;
> +		key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
> +		cur = em->start + em->len;
> +		free_extent_map(em);
> +
> +		ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
> +		if (ret > 0) {
> +			WARN(1, KERN_ERR
> +			"chunk [%llu %llu) doesn't has its block group item\n",

I'd rephrase this to "chunk [%llu %llu) doesn't have matching block
group item"

> +			     key.objectid, key.objectid + key.offset);
> +			ret = -ENOENT;
> +			goto error;
> +		}

Looks good, however when the time for merging comes I'd rather have this
code be part of a function named find_block_group or some such. Let's
see if this code brings any improvements and then bikeshed on the details.

> +		if (ret < 0)
>  			goto error;
>  
>  		leaf = path->nodes[0];
> @@ -10062,7 +10027,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  			goto error;
>  		}
>  
> -		key.objectid = found_key.objectid + found_key.offset;
>  		btrfs_release_path(path);
>  
>  		/*
> 
Qu Wenruo Feb. 22, 2018, 9:23 a.m. UTC | #3
[snip]
>> -}
>> -
>>  void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
>>  {
>>  	struct btrfs_block_group_cache *block_group;
>> @@ -9988,12 +9934,15 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>  {
>>  	struct btrfs_path *path;
>>  	int ret;
>> +	struct btrfs_mapping_tree *map_tree = &info->mapping_tree;
>> +	struct btrfs_root *extent_root = info->extent_root;
>>  	struct btrfs_block_group_cache *cache;
>>  	struct btrfs_space_info *space_info;
>>  	struct btrfs_key key;
>>  	struct btrfs_key found_key;
>>  	struct extent_buffer *leaf;
>>  	int need_clear = 0;
>> +	u64 cur = 0;
>>  	u64 cache_gen;
>>  	u64 feature;
>>  	int mixed;
>> @@ -10001,13 +9950,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>  	feature = btrfs_super_incompat_flags(info->super_copy);
>>  	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
>>  
>> -	key.objectid = 0;
>> -	key.offset = 0;
>> -	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
>>  	path = btrfs_alloc_path();
>>  	if (!path)
>>  		return -ENOMEM;
>> -	path->reada = READA_FORWARD;
>>  
>>  	cache_gen = btrfs_super_cache_generation(info->super_copy);
>>  	if (btrfs_test_opt(info, SPACE_CACHE) &&
>> @@ -10017,10 +9962,30 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>  		need_clear = 1;
>>  
>>  	while (1) {
>> -		ret = find_first_block_group(info, path, &key);
>> -		if (ret > 0)
>> +		struct extent_map *em;
>> +
>> +		read_lock(&map_tree->map_tree.lock);
>> +		em = lookup_extent_mapping(&map_tree->map_tree, cur,
>> +					   ((u64)-1) - cur);
>> +		read_unlock(&map_tree->map_tree.lock);
>> +		if (!em)
>>  			break;
>> -		if (ret != 0)
>> +
>> +		key.objectid = em->start;
>> +		key.offset = em->len;
>> +		key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
>> +		cur = em->start + em->len;
>> +		free_extent_map(em);
>> +
>> +		ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
>> +		if (ret > 0) {
>> +			WARN(1, KERN_ERR
>> +			"chunk [%llu %llu) doesn't has its block group item\n",
> 
> I'd rephrase this to "chunk [%llu %llu) doesn't have matching block
> group item"

Sounds good.

Sorry for my poor English.

> 
>> +			     key.objectid, key.objectid + key.offset);
>> +			ret = -ENOENT;
>> +			goto error;
>> +		}
> 
> Looks good, howevr when the time for merging comes I'd rather have this
> code be part of a function named find_block_group or some such. Let's
> see if this code brings any improvements and then bikeshed on the details.

Isn't that the original find_first_block_group() function?

Thanks,
Qu

> 
>> +		if (ret < 0)
>>  			goto error;
>>  
>>  		leaf = path->nodes[0];
>> @@ -10062,7 +10027,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>  			goto error;
>>  		}
>>  
>> -		key.objectid = found_key.objectid + found_key.offset;
>>  		btrfs_release_path(path);
>>  
>>  		/*
>>
Nikolay Borisov Feb. 22, 2018, 9:27 a.m. UTC | #4
On 22.02.2018 11:23, Qu Wenruo wrote:
> 
> 
> [snip]
>>> -}
>>> -
>>>  void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
>>>  {
>>>  	struct btrfs_block_group_cache *block_group;
>>> @@ -9988,12 +9934,15 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>>  {
>>>  	struct btrfs_path *path;
>>>  	int ret;
>>> +	struct btrfs_mapping_tree *map_tree = &info->mapping_tree;
>>> +	struct btrfs_root *extent_root = info->extent_root;
>>>  	struct btrfs_block_group_cache *cache;
>>>  	struct btrfs_space_info *space_info;
>>>  	struct btrfs_key key;
>>>  	struct btrfs_key found_key;
>>>  	struct extent_buffer *leaf;
>>>  	int need_clear = 0;
>>> +	u64 cur = 0;
>>>  	u64 cache_gen;
>>>  	u64 feature;
>>>  	int mixed;
>>> @@ -10001,13 +9950,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>>  	feature = btrfs_super_incompat_flags(info->super_copy);
>>>  	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
>>>  
>>> -	key.objectid = 0;
>>> -	key.offset = 0;
>>> -	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
>>>  	path = btrfs_alloc_path();
>>>  	if (!path)
>>>  		return -ENOMEM;
>>> -	path->reada = READA_FORWARD;
>>>  
>>>  	cache_gen = btrfs_super_cache_generation(info->super_copy);
>>>  	if (btrfs_test_opt(info, SPACE_CACHE) &&
>>> @@ -10017,10 +9962,30 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>>  		need_clear = 1;
>>>  
>>>  	while (1) {
>>> -		ret = find_first_block_group(info, path, &key);
>>> -		if (ret > 0)
>>> +		struct extent_map *em;
>>> +
>>> +		read_lock(&map_tree->map_tree.lock);
>>> +		em = lookup_extent_mapping(&map_tree->map_tree, cur,
>>> +					   ((u64)-1) - cur);
>>> +		read_unlock(&map_tree->map_tree.lock);
>>> +		if (!em)
>>>  			break;
>>> -		if (ret != 0)
>>> +
>>> +		key.objectid = em->start;
>>> +		key.offset = em->len;
>>> +		key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
>>> +		cur = em->start + em->len;
>>> +		free_extent_map(em);
>>> +
>>> +		ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
>>> +		if (ret > 0) {
>>> +			WARN(1, KERN_ERR
>>> +			"chunk [%llu %llu) doesn't has its block group item\n",
>>
>> I'd rephrase this to "chunk [%llu %llu) doesn't have matching block
>> group item"
> 
> Sounds good.
> 
> Sorry for my poor English.

No need to apologise, neither of us is a native speaker :)

> 
>>
>>> +			     key.objectid, key.objectid + key.offset);
>>> +			ret = -ENOENT;
>>> +			goto error;
>>> +		}
>>
>> Looks good, howevr when the time for merging comes I'd rather have this
>> code be part of a function named find_block_group or some such. Let's
>> see if this code brings any improvements and then bikeshed on the details.
> 
> Isn't that the original find_first_block_group() function?

Yes, but since you have removed it and you are not really looking for the
"first" block group, just looking for a block group, the new name
makes more sense and collects the code in one place.

> 
> Thanks,
> Qu
> 
>>
>>> +		if (ret < 0)
>>>  			goto error;
>>>  
>>>  		leaf = path->nodes[0];
>>> @@ -10062,7 +10027,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>>  			goto error;
>>>  		}
>>>  
>>> -		key.objectid = found_key.objectid + found_key.offset;
>>>  		btrfs_release_path(path);
>>>  
>>>  		/*
>>>
Ellis H. Wilson III Feb. 22, 2018, 4:31 p.m. UTC | #5
On 02/21/2018 11:56 PM, Qu Wenruo wrote:
> On 2018-02-22 12:52, Qu Wenruo wrote:
>> btrfs_read_block_groups() is used to build up the block group cache for
>> all block groups, so it will iterate all block group items in extent
>> tree.
>>
>> For large filesystem (TB level), it will search for BLOCK_GROUP_ITEM
>> thousands times, which is the most time consuming part of mounting
>> btrfs.
>>
>> So this patch will try to speed it up by:
>>
>> 1) Avoid unnecessary readahead
>>     We were using READA_FORWARD to search for block group item.
>>     However block group items are in fact scattered across quite a lot of
>>     leaves. Doing readahead will just waste our IO (especially important
>>     for HDD).
>>
>>     In real world case, for a filesystem with 3T used space, it would
>>     have about 50K extent tree leaves, but only have 3K block group
>>     items. Meaning we need to iterate 16 leaves to meet one block group
>>     on average.
>>
>>     So readahead won't help but waste slow HDD seeks.
>>
>> 2) Use chunk mapping to locate block group items
>>     Since one block group item always has one corresponding chunk item,
>>     we could use chunk mapping to get the block group item size.
>>
>>     With block group item size, we can do a pinpoint tree search, instead
>>     of searching with some uncertain value and do forward search.
>>
>>     In some case, like next BLOCK_GROUP_ITEM is in the next leaf of
>>     current path, we could save such unnecessary tree block read.
>>
>> Cc: Ellis H. Wilson III <ellisw@panasas.com>
> 
> Hi Ellis,
> 
> Would you please try this patch to see if it helps to speedup the mount
> of your large filesystem?

I will try either tomorrow or over the weekend.  I'm waiting on hardware 
that I can build and load a custom kernel on.

Thanks so much for taking a stab at this!

ellis
Qu Wenruo Feb. 22, 2018, 11:37 p.m. UTC | #6
On 2018-02-23 00:31, Ellis H. Wilson III wrote:
> On 02/21/2018 11:56 PM, Qu Wenruo wrote:
>> On 2018-02-22 12:52, Qu Wenruo wrote:
>>> btrfs_read_block_groups() is used to build up the block group cache for
>>> all block groups, so it will iterate all block group items in extent
>>> tree.
>>>
>>> For large filesystem (TB level), it will search for BLOCK_GROUP_ITEM
>>> thousands times, which is the most time consuming part of mounting
>>> btrfs.
>>>
>>> So this patch will try to speed it up by:
>>>
>>> 1) Avoid unnecessary readahead
>>>     We were using READA_FORWARD to search for block group item.
>>>     However block group items are in fact scattered across quite a
>>> lot of
>>>     leaves. Doing readahead will just waste our IO (especially important
>>>     for HDD).
>>>
>>>     In real world case, for a filesystem with 3T used space, it would
>>>     have about 50K extent tree leaves, but only have 3K block group
>>>     items. Meaning we need to iterate 16 leaves to meet one block group
>>>     on average.
>>>
>>>     So readahead won't help but waste slow HDD seeks.
>>>
>>> 2) Use chunk mapping to locate block group items
>>>     Since one block group item always has one corresponding chunk item,
>>>     we could use chunk mapping to get the block group item size.
>>>
>>>     With block group item size, we can do a pinpoint tree search,
>>> instead
>>>     of searching with some uncertain value and do forward search.
>>>
>>>     In some case, like next BLOCK_GROUP_ITEM is in the next leaf of
>>>     current path, we could save such unnecessary tree block read.
>>>
>>> Cc: Ellis H. Wilson III <ellisw@panasas.com>
>>
>> Hi Ellis,
>>
>> Would you please try this patch to see if it helps to speedup the mount
>> of your large filesystem?
> 
> I will try either tomorrow or over the weekend.  I'm waiting on hardware
> to be able to build and load a custom kernel on.

If you're using Archlinux, I could build the package for you.

(Unfortunately I'm not that familiar with other distributions.)

Thanks,
Qu
> 
> Thanks so much for taking a stab at this!
> 
> ellis
Holger Hoffstätte Feb. 23, 2018, 12:50 a.m. UTC | #7
On 02/22/18 05:52, Qu Wenruo wrote:
> btrfs_read_block_groups() is used to build up the block group cache for
> all block groups, so it will iterate all block group items in extent
> tree.
> 
> For large filesystem (TB level), it will search for BLOCK_GROUP_ITEM
> thousands times, which is the most time consuming part of mounting
> btrfs.
> 
> So this patch will try to speed it up by:
> 
> 1) Avoid unnecessary readahead
>    We were using READA_FORWARD to search for block group item.
>    However block group items are in fact scattered across quite a lot of
>    leaves. Doing readahead will just waste our IO (especially important
>    for HDD).
> 
>    In real world case, for a filesystem with 3T used space, it would
>    have about 50K extent tree leaves, but only have 3K block group
>    items. Meaning we need to iterate 16 leaves to meet one block group
>    on average.
> 
>    So readahead won't help but waste slow HDD seeks.
> 
> 2) Use chunk mapping to locate block group items
>    Since one block group item always has one corresponding chunk item,
>    we could use chunk mapping to get the block group item size.
> 
>    With block group item size, we can do a pinpoint tree search, instead
>    of searching with some uncertain value and do forward search.
> 
>    In some case, like next BLOCK_GROUP_ITEM is in the next leaf of
>    current path, we could save such unnecessary tree block read.
> 
> Cc: Ellis H. Wilson III <ellisw@panasas.com>
> Signed-off-by: Qu Wenruo <wqu@suse.com>

Decided to give this a try & got interesting results!

The scene of the crime:
Holger Hoffstätte Feb. 23, 2018, 1:12 a.m. UTC | #8
On 02/22/18 05:52, Qu Wenruo wrote:
> btrfs_read_block_groups() is used to build up the block group cache for
> all block groups, so it will iterate all block group items in extent
> tree.
> 
> For large filesystem (TB level), it will search for BLOCK_GROUP_ITEM
> thousands times, which is the most time consuming part of mounting
> btrfs.
> 
> So this patch will try to speed it up by:
> 
> 1) Avoid unnecessary readahead
>    We were using READA_FORWARD to search for block group item.
>    However block group items are in fact scattered across quite a lot of
>    leaves. Doing readahead will just waste our IO (especially important
>    for HDD).
> 
>    In real world case, for a filesystem with 3T used space, it would
>    have about 50K extent tree leaves, but only have 3K block group
>    items. Meaning we need to iterate 16 leaves to meet one block group
>    on average.
> 
>    So readahead won't help but waste slow HDD seeks.
> 
> 2) Use chunk mapping to locate block group items
>    Since one block group item always has one corresponding chunk item,
>    we could use chunk mapping to get the block group item size.
> 
>    With block group item size, we can do a pinpoint tree search, instead
>    of searching with some uncertain value and do forward search.
> 
>    In some case, like next BLOCK_GROUP_ITEM is in the next leaf of
>    current path, we could save such unnecessary tree block read.
> 
> Cc: Ellis H. Wilson III <ellisw@panasas.com>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> Since all my TB level storage is all occupied by my NAS, any feedback
> (especially for the real world mount speed change) is welcome.

(sorry for the previous mail without results..finger salad)

Decided to give this a try & got some nice results!
Probably not on the same scale & with the nonlinear behaviour that Ellis
will provide, since I just don't have that much storage, but interesting
nevertheless.

$btrfs filesystem df /mnt/backup 
Data, single: total=1.10TiB, used=1.09TiB
System, DUP: total=32.00MiB, used=144.00KiB
Metadata, DUP: total=4.00GiB, used=2.23GiB
GlobalReserve, single: total=512.00MiB, used=0.00B

$btrfs-debug-tree -t chunk /dev/sdc1 | grep CHUNK_ITEM | wc -l
1137

current kernel (4.14++ with most of blk-mq+BFQ from 4.16):
mount /mnt/backup  0.00s user 0.02s system 1% cpu 1.211 total
mount /mnt/backup  0.00s user 0.02s system 2% cpu 1.122 total
mount /mnt/backup  0.00s user 0.02s system 2% cpu 1.236 total

patched:
mount /mnt/backup  0.00s user 0.02s system 1% cpu 1.070 total
mount /mnt/backup  0.00s user 0.02s system 1% cpu 1.056 total
mount /mnt/backup  0.00s user 0.02s system 1% cpu 1.058 total

That's not overwhelming, but still measurable and nice to have!

While I was at it I decided to fill up the drive to almost-max
~3.7TB and see how much slower it would get... you won't believe
what happened next. :-)

$btrfs-debug-tree -t chunk /dev/sdc1 | grep CHUNK_ITEM | wc -l
3719

mount /mnt/backup  0.00s user 0.02s system 2% cpu 1.328 total
mount /mnt/backup  0.00s user 0.03s system 2% cpu 1.361 total
mount /mnt/backup  0.00s user 0.03s system 2% cpu 1.368 total

Over three times the data, almost the same mount time as before?
Yes please!

Overall this looks like a really nice improvement. Glad to see
that my suspicion about the (non)usefulness of the readahead turned
out to be true. :)

cheers,
Holger
Qu Wenruo Feb. 23, 2018, 2 a.m. UTC | #9
On 2018-02-23 09:12, Holger Hoffstätte wrote:
> On 02/22/18 05:52, Qu Wenruo wrote:
>> btrfs_read_block_groups() is used to build up the block group cache for
>> all block groups, so it will iterate all block group items in extent
>> tree.
>>
>> For large filesystem (TB level), it will search for BLOCK_GROUP_ITEM
>> thousands times, which is the most time consuming part of mounting
>> btrfs.
>>
>> So this patch will try to speed it up by:
>>
>> 1) Avoid unnecessary readahead
>>    We were using READA_FORWARD to search for block group item.
>>    However block group items are in fact scattered across quite a lot of
>>    leaves. Doing readahead will just waste our IO (especially important
>>    for HDD).
>>
>>    In real world case, for a filesystem with 3T used space, it would
>>    have about 50K extent tree leaves, but only have 3K block group
>>    items. Meaning we need to iterate 16 leaves to meet one block group
>>    on average.
>>
>>    So readahead won't help but waste slow HDD seeks.
>>
>> 2) Use chunk mapping to locate block group items
>>    Since one block group item always has one corresponding chunk item,
>>    we could use chunk mapping to get the block group item size.
>>
>>    With block group item size, we can do a pinpoint tree search, instead
>>    of searching with some uncertain value and do forward search.
>>
>>    In some case, like next BLOCK_GROUP_ITEM is in the next leaf of
>>    current path, we could save such unnecessary tree block read.
>>
>> Cc: Ellis H. Wilson III <ellisw@panasas.com>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>> Since all my TB level storage is all occupied by my NAS, any feedback
>> (especially for the real world mount speed change) is welcome.
> 
> (sorry for the previous mail without results..finger salad)
> 
> Decided to give this a try & got some nice results!
> Probably not on the same scale & nonlinear behaviour as Ellis will
> provide since I just don't have that much storage, but interesting
> nevertheless.
> 
> $btrfs filesystem df /mnt/backup 
> Data, single: total=1.10TiB, used=1.09TiB
> System, DUP: total=32.00MiB, used=144.00KiB
> Metadata, DUP: total=4.00GiB, used=2.23GiB
> GlobalReserve, single: total=512.00MiB, used=0.00B
> 
> $btrfs-debug-tree -t chunk /dev/sdc1 | grep CHUNK_ITEM | wc -l
> 1137
> 
> current kernel (4.14++ with most of blk-mq+BFQ from 4.16):
> mount /mnt/backup  0.00s user 0.02s system 1% cpu 1.211 total
> mount /mnt/backup  0.00s user 0.02s system 2% cpu 1.122 total
> mount /mnt/backup  0.00s user 0.02s system 2% cpu 1.236 total
> 
> patched:
> mount /mnt/backup  0.00s user 0.02s system 1% cpu 1.070 total
> mount /mnt/backup  0.00s user 0.02s system 1% cpu 1.056 total
> mount /mnt/backup  0.00s user 0.02s system 1% cpu 1.058 total
> 
> That's not overwhelming, but still measurable and nice to have!

Looks pretty good.

And pretty close to my prediction, about 15% improvement.

> 
> While I was at it I decided to fill up the drive to almost-max
> ~3.7TB and see how much slower it woulöd get...you won't believe
> what happened next. :-)
> 
> $btrfs-debug-tree -t chunk /dev/sdc1 | grep CHUNK_ITEM | wc -l
> 3719
> 
> mount /mnt/backup  0.00s user 0.02s system 2% cpu 1.328 total
> mount /mnt/backup  0.00s user 0.03s system 2% cpu 1.361 total
> mount /mnt/backup  0.00s user 0.03s system 2% cpu 1.368 total
> 
> Over three times the data, almost the same mount time as before?
> Yes please!
> 

This is indeed beyond my expectation.

But after some thought, this depends on how you fill the fs.

If it is filled using fallocate/plain dd, most of the extents will be
data extents at the maximum extent size (128M), which makes the new data
block groups quite compact, just 8+1 items for each block group.
So a leaf can easily contain around 100 chunks.
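
(For concreteness, a quick back-of-the-envelope check of the 8 + 1
figure; the 1GiB data block group size below is an assumption based on
the default, not read from Holger's fs:)

#include <stdio.h>

int main(void)
{
	/* Assumed default 1GiB data block group size; 128M is the
	 * maximum data extent size mentioned above. */
	unsigned long long bg_size = 1024ULL << 20;	/* 1 GiB   */
	unsigned long long max_extent = 128ULL << 20;	/* 128 MiB */
	unsigned long long extent_items = bg_size / max_extent;

	printf("%llu EXTENT_ITEMs + 1 BLOCK_GROUP_ITEM = %llu items per block group\n",
	       extent_items, extent_items + 1);
	return 0;
}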

This makes block group iteration for the new data block groups quite
fast, as all related tree blocks will be in cache (which also explains
why CPU usage increased).

That is to say, if I didn't miss anything, the old kernel should also be
able to mount it in 2 secs.

This would in fact be the worst case for the patch, as normally
readahead would benefit such a case.
But since the mount time is pretty much the same as with the old kernel,
I would call this a win!


> Overall this looks like a really nice improvement. Glad to see
> that my suspicion about the (non)usefulness of the readhead turned
> out to be true. :)

Yep, the readahead part is new for this kind of speedup patch (there was
a similar patch years ago, but without disabling readahead), and I'm
pretty glad to see the result.

(Although a little sad to know readahead is of little use)

Thanks for your test,
Qu

> 
> cheers,
> Holger
Ellis H. Wilson III Feb. 23, 2018, 4:29 p.m. UTC | #10
On 02/22/2018 06:37 PM, Qu Wenruo wrote:
> On 2018-02-23 00:31, Ellis H. Wilson III wrote:
>> On 02/21/2018 11:56 PM, Qu Wenruo wrote:
>>> On 2018-02-22 12:52, Qu Wenruo wrote:
>>>> btrfs_read_block_groups() is used to build up the block group cache for
>>>> all block groups, so it will iterate all block group items in extent
>>>> tree.
>>>>
>>>> For large filesystem (TB level), it will search for BLOCK_GROUP_ITEM
>>>> thousands times, which is the most time consuming part of mounting
>>>> btrfs.
>>>>
>>>> So this patch will try to speed it up by:
>>>>
>>>> 1) Avoid unnecessary readahead
>>>>      We were using READA_FORWARD to search for block group item.
>>>>      However block group items are in fact scattered across quite a
>>>> lot of
>>>>      leaves. Doing readahead will just waste our IO (especially important
>>>>      for HDD).
>>>>
>>>>      In real world case, for a filesystem with 3T used space, it would
>>>>      have about 50K extent tree leaves, but only have 3K block group
>>>>      items. Meaning we need to iterate 16 leaves to meet one block group
>>>>      on average.
>>>>
>>>>      So readahead won't help but waste slow HDD seeks.
>>>>
>>>> 2) Use chunk mapping to locate block group items
>>>>      Since one block group item always has one corresponding chunk item,
>>>>      we could use chunk mapping to get the block group item size.
>>>>
>>>>      With block group item size, we can do a pinpoint tree search,
>>>> instead
>>>>      of searching with some uncertain value and do forward search.
>>>>
>>>>      In some case, like next BLOCK_GROUP_ITEM is in the next leaf of
>>>>      current path, we could save such unnecessary tree block read.
>>>>
>>>> Cc: Ellis H. Wilson III <ellisw@panasas.com>
>>>
>>> Hi Ellis,
>>>
>>> Would you please try this patch to see if it helps to speedup the mount
>>> of your large filesystem?
>>
>> I will try either tomorrow or over the weekend.  I'm waiting on hardware
>> to be able to build and load a custom kernel on.
> 
> If you're using Archlinux, I could build the package for you.
> 
> (For other distributions, unfortunately I'm not that familiar with)
> 
> Thanks,
> Qu

No sweat.  I'm not running arch anywhere, so was glad to handle this myself.

Short story: It doesn't appear to have any notable impact on mount time.

Long story:
#Built a modern kernel:
git clone https://github.com/kdave/btrfs-devel
cd'd into btrfs-devel
copied my current kernel config in /boot to .config
make olddefconfig
make -j16
make modules_install
make install
grub2-mkconfig -o /boot/grub/grub.cfg
reboot

#Reran tests with vanilla 4.16.0-rc1+ kernel
As root, of the form: time mount /dev/sdb /mnt/btrfs
5 iteration average: 16.869s

#Applied your patch, rebuilt, switched kernel module
wget -O - 'https://patchwork.kernel.org/patch/10234619/mbox' | git am -
make -j16
make modules_install
rmmod btrfs
modprobe btrfs

#Reran tests with patched 4.16.0-rc1+ kernel
As root, of the form: time mount /dev/sdb /mnt/btrfs
5 iteration average: 16.642s

So, there's a slight improvement against vanilla 4.16.0-rc1+, but it's 
still slightly slower than my original runs in 4.5.5, which got me 
16.553s.  In any event, most of this is statistically insignificant 
since the standard deviation is about two tenths of a second.

So, my conclusion here is that this problem needs to be handled at an 
architectural level to be truly solved (read: have mounts that take a few 
seconds at worst), which requires either:
a) On-disk format changes like you (Qu) suggested some time back for a 
tree of block groups, or
b) Lazy block group walking post-mount, with algorithms that can cope with 
making sub-optimal choices.  One would likely want to stonewall certain 
operations (balance, defrag, etc.) until the lazy post-mount walk 
completes, since those have more reason to require complete knowledge 
of the usage of each block group.

I may take a stab at b), but I'm first going to do the tests I promised 
relating to how mount times scale with increased capacity consumption 
for varying filesizes.

Best,

ellis
Qu Wenruo Feb. 24, 2018, 2:37 a.m. UTC | #11
On 2018-02-24 00:29, Ellis H. Wilson III wrote:
> On 02/22/2018 06:37 PM, Qu Wenruo wrote:
>> On 2018-02-23 00:31, Ellis H. Wilson III wrote:
>>> On 02/21/2018 11:56 PM, Qu Wenruo wrote:
>>>> On 2018-02-22 12:52, Qu Wenruo wrote:
>>>>> btrfs_read_block_groups() is used to build up the block group cache
>>>>> for
>>>>> all block groups, so it will iterate all block group items in extent
>>>>> tree.
>>>>>
>>>>> For large filesystem (TB level), it will search for BLOCK_GROUP_ITEM
>>>>> thousands times, which is the most time consuming part of mounting
>>>>> btrfs.
>>>>>
>>>>> So this patch will try to speed it up by:
>>>>>
>>>>> 1) Avoid unnecessary readahead
>>>>>      We were using READA_FORWARD to search for block group item.
>>>>>      However block group items are in fact scattered across quite a
>>>>> lot of
>>>>>      leaves. Doing readahead will just waste our IO (especially
>>>>> important
>>>>>      for HDD).
>>>>>
>>>>>      In real world case, for a filesystem with 3T used space, it would
>>>>>      have about 50K extent tree leaves, but only have 3K block group
>>>>>      items. Meaning we need to iterate 16 leaves to meet one block
>>>>> group
>>>>>      on average.
>>>>>
>>>>>      So readahead won't help but waste slow HDD seeks.
>>>>>
>>>>> 2) Use chunk mapping to locate block group items
>>>>>      Since one block group item always has one corresponding chunk
>>>>> item,
>>>>>      we could use chunk mapping to get the block group item size.
>>>>>
>>>>>      With block group item size, we can do a pinpoint tree search,
>>>>> instead
>>>>>      of searching with some uncertain value and do forward search.
>>>>>
>>>>>      In some case, like next BLOCK_GROUP_ITEM is in the next leaf of
>>>>>      current path, we could save such unnecessary tree block read.
>>>>>
>>>>> Cc: Ellis H. Wilson III <ellisw@panasas.com>
>>>>
>>>> Hi Ellis,
>>>>
>>>> Would you please try this patch to see if it helps to speedup the mount
>>>> of your large filesystem?
>>>
>>> I will try either tomorrow or over the weekend.  I'm waiting on hardware
>>> to be able to build and load a custom kernel on.
>>
>> If you're using Archlinux, I could build the package for you.
>>
>> (For other distributions, unfortunately I'm not that familiar with)
>>
>> Thanks,
>> Qu
> 
> No sweat.  I'm not running arch anywhere, so was glad to handle this
> myself.
> 
> Short story: It doesn't appear to have any notable impact on mount time.
> 
> Long story:
> #Built a modern kernel:
> git clone https://github.com/kdave/btrfs-devel
> cd'd into btrfs-devel
> copied my current kernel config in /boot to .config
> make olddefconfig
> make -j16
> make modules_install
> make install
> grub2-mkconfig -o /boot/grub/grub.cfg
> reboot
> 
> #Reran tests with vanilla 4.16.0-rc1+ kernel
> As root, of the form: time mount /dev/sdb /mnt/btrfs
> 5 iteration average: 16.869s
> 
> #Applied your patch, rebuild, switched kernel module
> wget -O - 'https://patchwork.kernel.org/patch/10234619/mbox' | git am -
> make -j16
> make modules_install
> rmmod btrfs
> modprobe btrfs
> 
> #Reran tests with patched 4.16.0-rc1+ kernel
> As root, of the form: time mount /dev/sdb /mnt/btrfs
> 5 iteration average: 16.642s
> 
> So, there's a slight improvement against vanilla 4.16.0-rc1+, but it's
> still slightly slower than my original runs in 4.5.5, which got me
> 16.553s.  In any event, most of this is statistically unsignificant
> since the standard deviation is about two tenths of a second.

Yep, I also saw similar reports when the first version was sent.

Apart from the readahead change, the patch can only reduce disk reads
where block group items are located at the 1st slot of a leaf.

If all block group items are located from the 2nd slot of their leaves
onward, then it shouldn't have much effect.
And I think your fs is already in such a state, so it doesn't get much
speedup.

BTW, you could also verify this with btrfs-debug-tree.

To get the total number of block group items:
# btrfs-debug-tree -t extent <device> | grep BLOCK_GROUP_ITEM | grep
item | wc -l

To get the number of block group items located at the 1st slot:
# btrfs-debug-tree -t extent <device> | grep BLOCK_GROUP_ITEM | grep
"item 0" | wc -l

And the ratio should indicate how much the patch can speed things up.

>
> So, my conclusion here is this problem needs to be handled at an
> architectural level to be truly solved (read: have mounts that few
> seconds at worst), which either requires:
> a) On-disk format changes like you (Qu) suggested some time back for a
> tree of block groups or
> b) Lazy block group walking post-mount and algorithms that can cope with
> making sub-optimal choices.  One would likely want to stonewall out
> certain operations until the lazy post-mount walk completed like
> balance, defrag, etc, that have more reason to require complete
> knowledge of the usage of each block group.
> 
> I may take a stab at b), but I'm first going to do the tests I promised
> relating to how mount times scale with increased capacity consumption
> for varying filesizes.

Along with the above ratio, the mount time difference would also help.

Thanks,
Qu

> 
> Best,
> 
> ellis

Patch

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2f4328511ac8..a3faa0cbe056 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9713,60 +9713,6 @@  int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
 	return ret;
 }
 
-static int find_first_block_group(struct btrfs_fs_info *fs_info,
-				  struct btrfs_path *path,
-				  struct btrfs_key *key)
-{
-	struct btrfs_root *root = fs_info->extent_root;
-	int ret = 0;
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
-	int slot;
-
-	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-
-	while (1) {
-		slot = path->slots[0];
-		leaf = path->nodes[0];
-		if (slot >= btrfs_header_nritems(leaf)) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret == 0)
-				continue;
-			if (ret < 0)
-				goto out;
-			break;
-		}
-		btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
-		if (found_key.objectid >= key->objectid &&
-		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
-			struct extent_map_tree *em_tree;
-			struct extent_map *em;
-
-			em_tree = &root->fs_info->mapping_tree.map_tree;
-			read_lock(&em_tree->lock);
-			em = lookup_extent_mapping(em_tree, found_key.objectid,
-						   found_key.offset);
-			read_unlock(&em_tree->lock);
-			if (!em) {
-				btrfs_err(fs_info,
-			"logical %llu len %llu found bg but no related chunk",
-					  found_key.objectid, found_key.offset);
-				ret = -ENOENT;
-			} else {
-				ret = 0;
-			}
-			free_extent_map(em);
-			goto out;
-		}
-		path->slots[0]++;
-	}
-out:
-	return ret;
-}
-
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
 {
 	struct btrfs_block_group_cache *block_group;
@@ -9988,12 +9934,15 @@  int btrfs_read_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_path *path;
 	int ret;
+	struct btrfs_mapping_tree *map_tree = &info->mapping_tree;
+	struct btrfs_root *extent_root = info->extent_root;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_space_info *space_info;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
 	int need_clear = 0;
+	u64 cur = 0;
 	u64 cache_gen;
 	u64 feature;
 	int mixed;
@@ -10001,13 +9950,9 @@  int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	feature = btrfs_super_incompat_flags(info->super_copy);
 	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
 
-	key.objectid = 0;
-	key.offset = 0;
-	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->reada = READA_FORWARD;
 
 	cache_gen = btrfs_super_cache_generation(info->super_copy);
 	if (btrfs_test_opt(info, SPACE_CACHE) &&
@@ -10017,10 +9962,30 @@  int btrfs_read_block_groups(struct btrfs_fs_info *info)
 		need_clear = 1;
 
 	while (1) {
-		ret = find_first_block_group(info, path, &key);
-		if (ret > 0)
+		struct extent_map *em;
+
+		read_lock(&map_tree->map_tree.lock);
+		em = lookup_extent_mapping(&map_tree->map_tree, cur,
+					   ((u64)-1) - cur);
+		read_unlock(&map_tree->map_tree.lock);
+		if (!em)
 			break;
-		if (ret != 0)
+
+		key.objectid = em->start;
+		key.offset = em->len;
+		key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+		cur = em->start + em->len;
+		free_extent_map(em);
+
+		ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+		if (ret > 0) {
+			WARN(1, KERN_ERR
+			"chunk [%llu %llu) doesn't has its block group item\n",
+			     key.objectid, key.objectid + key.offset);
+			ret = -ENOENT;
+			goto error;
+		}
+		if (ret < 0)
 			goto error;
 
 		leaf = path->nodes[0];
@@ -10062,7 +10027,6 @@  int btrfs_read_block_groups(struct btrfs_fs_info *info)
 			goto error;
 		}
 
-		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(path);
 
 		/*