btrfs: speedup mount time with force readahead chunk tree
diff mbox series

Message ID 20200701092957.20870-1-robbieko@synology.com
State New
Headers show
Series
  • btrfs: speedup mount time with force readahead chunk tree
Related show

Commit Message

robbieko July 1, 2020, 9:29 a.m. UTC
From: Robbie Ko <robbieko@synology.com>

When mounting, we always need to read the whole chunk tree,
when there are too many chunk items, most of the time is
spent on btrfs_read_chunk_tree, because we only read one
leaf at a time.

We fix this by adding a new readahead mode READA_FORWARD_FORCE,
which reads all the leaves after the key in the node when
reading a level 1 node.

Signed-off-by: Robbie Ko <robbieko@synology.com>
---
 fs/btrfs/ctree.c   | 7 +++++--
 fs/btrfs/ctree.h   | 2 +-
 fs/btrfs/volumes.c | 1 +
 3 files changed, 7 insertions(+), 3 deletions(-)

Comments

robbieko July 6, 2020, 6:13 a.m. UTC | #1
Does anyone have any suggestions?

robbieko 於 2020/7/1 下午5:29 寫道:
> From: Robbie Ko <robbieko@synology.com>
>
> When mounting, we always need to read the whole chunk tree,
> when there are too many chunk items, most of the time is
> spent on btrfs_read_chunk_tree, because we only read one
> leaf at a time.
>
> We fix this by adding a new readahead mode READA_FORWARD_FORCE,
> which reads all the leaves after the key in the node when
> reading a level 1 node.
>
> Signed-off-by: Robbie Ko <robbieko@synology.com>
> ---
>   fs/btrfs/ctree.c   | 7 +++++--
>   fs/btrfs/ctree.h   | 2 +-
>   fs/btrfs/volumes.c | 1 +
>   3 files changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 3a7648bff42c..abb9108e2d7d 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -2194,7 +2194,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
>   			if (nr == 0)
>   				break;
>   			nr--;
> -		} else if (path->reada == READA_FORWARD) {
> +		} else if (path->reada == READA_FORWARD || path->reada == READA_FORWARD_FORCE) {
>   			nr++;
>   			if (nr >= nritems)
>   				break;
> @@ -2205,12 +2205,15 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
>   				break;
>   		}
>   		search = btrfs_node_blockptr(node, nr);
> -		if ((search <= target && target - search <= 65536) ||
> +		if ((path->reada == READA_FORWARD_FORCE) ||
> +		    (search <= target && target - search <= 65536) ||
>   		    (search > target && search - target <= 65536)) {
>   			readahead_tree_block(fs_info, search);
>   			nread += blocksize;
>   		}
>   		nscan++;
> +		if (path->reada == READA_FORWARD_FORCE)
> +			continue;
>   		if ((nread > 65536 || nscan > 32))
>   			break;
>   	}
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index d404cce8ae40..808bcbdc9530 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -353,7 +353,7 @@ struct btrfs_node {
>    * The slots array records the index of the item or block pointer
>    * used while walking the tree.
>    */
> -enum { READA_NONE, READA_BACK, READA_FORWARD };
> +enum { READA_NONE, READA_BACK, READA_FORWARD, READA_FORWARD_FORCE };
>   struct btrfs_path {
>   	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
>   	int slots[BTRFS_MAX_LEVEL];
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 0d6e785bcb98..78fd65abff69 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -7043,6 +7043,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
>   	path = btrfs_alloc_path();
>   	if (!path)
>   		return -ENOMEM;
> +	path->reada = READA_FORWARD_FORCE;
>   
>   	/*
>   	 * uuid_mutex is needed only if we are mounting a sprout FS
Qu Wenruo July 6, 2020, 6:16 a.m. UTC | #2
On 2020/7/6 下午2:13, Robbie Ko wrote:
> Does anyone have any suggestions?

I believe David's suggestion on using regular readahead is already good
enough for chunk tree.

Especially since chunk tree is not really the main cause for slow mount.

Thanks,
Qu

> 
> robbieko 於 2020/7/1 下午5:29 寫道:
>> From: Robbie Ko <robbieko@synology.com>
>>
>> When mounting, we always need to read the whole chunk tree,
>> when there are too many chunk items, most of the time is
>> spent on btrfs_read_chunk_tree, because we only read one
>> leaf at a time.
>>
>> We fix this by adding a new readahead mode READA_FORWARD_FORCE,
>> which reads all the leaves after the key in the node when
>> reading a level 1 node.
>>
>> Signed-off-by: Robbie Ko <robbieko@synology.com>
>> ---
>>   fs/btrfs/ctree.c   | 7 +++++--
>>   fs/btrfs/ctree.h   | 2 +-
>>   fs/btrfs/volumes.c | 1 +
>>   3 files changed, 7 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
>> index 3a7648bff42c..abb9108e2d7d 100644
>> --- a/fs/btrfs/ctree.c
>> +++ b/fs/btrfs/ctree.c
>> @@ -2194,7 +2194,7 @@ static void reada_for_search(struct
>> btrfs_fs_info *fs_info,
>>               if (nr == 0)
>>                   break;
>>               nr--;
>> -        } else if (path->reada == READA_FORWARD) {
>> +        } else if (path->reada == READA_FORWARD || path->reada ==
>> READA_FORWARD_FORCE) {
>>               nr++;
>>               if (nr >= nritems)
>>                   break;
>> @@ -2205,12 +2205,15 @@ static void reada_for_search(struct
>> btrfs_fs_info *fs_info,
>>                   break;
>>           }
>>           search = btrfs_node_blockptr(node, nr);
>> -        if ((search <= target && target - search <= 65536) ||
>> +        if ((path->reada == READA_FORWARD_FORCE) ||
>> +            (search <= target && target - search <= 65536) ||
>>               (search > target && search - target <= 65536)) {
>>               readahead_tree_block(fs_info, search);
>>               nread += blocksize;
>>           }
>>           nscan++;
>> +        if (path->reada == READA_FORWARD_FORCE)
>> +            continue;
>>           if ((nread > 65536 || nscan > 32))
>>               break;
>>       }
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index d404cce8ae40..808bcbdc9530 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -353,7 +353,7 @@ struct btrfs_node {
>>    * The slots array records the index of the item or block pointer
>>    * used while walking the tree.
>>    */
>> -enum { READA_NONE, READA_BACK, READA_FORWARD };
>> +enum { READA_NONE, READA_BACK, READA_FORWARD, READA_FORWARD_FORCE };
>>   struct btrfs_path {
>>       struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
>>       int slots[BTRFS_MAX_LEVEL];
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 0d6e785bcb98..78fd65abff69 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -7043,6 +7043,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info
>> *fs_info)
>>       path = btrfs_alloc_path();
>>       if (!path)
>>           return -ENOMEM;
>> +    path->reada = READA_FORWARD_FORCE;
>>         /*
>>        * uuid_mutex is needed only if we are mounting a sprout FS
robbieko July 6, 2020, 8:05 a.m. UTC | #3
I've known btrfs_read_block_groups for a long time,

we can use BG_TREE freature to speed up btrfs_read_block_groups.

https://lwn.net/Articles/801990/


But reading the chunk tree also takes some time,

we can speed up the chunk tree by using the readahead mechanism.

Why we not just use regular forward readahead?
- Because the regular forward readahead ,
   reads only the logical address adjacent to the 64k,
   but the logical address of the next leaf may not be in 64k.

I have a test environment as follows:

200TB btrfs volume: used 192TB

Data, single: total=192.00TiB, used=192.00TiB
System, DUP: total=40.00MiB, used=19.91MiB
Metadata, DUP: total=63.00GiB, used=46.46GiB
GlobalReserve, single: total=2.00GiB, used=0.00B

chunk tree level : 2
chunk tree tree:
   nodes: 4
   leaves: 1270
   total: 1274
chunk tree size: 19.9 MB
SYSTEM chunks count : 2 (8MB, 32MB)

btrfs_read_chunk_tree spends the following time :

before: 1.89s

patch: 0.27s

Speed increase of about 85%.

Between the chunk tree leaves, there will be a different SYSTEM chunk,

when the The more frequent the chunk allocate/remove, the larger the gap 
between the leaves.

e.g. chunk tree node :
     key (FIRST_CHUNK_TREE CHUNK_ITEM 85020014280704) block 
79866020003840 (4874635010) gen 12963
     key (FIRST_CHUNK_TREE CHUNK_ITEM 85185370521600) block 28999680 
(1770) gen 12964
     key (FIRST_CHUNK_TREE CHUNK_ITEM 85351800504320) block 
79866020347904 (4874635031) gen 12964
     key (FIRST_CHUNK_TREE CHUNK_ITEM 85518230487040) block 
79866020102144 (4874635016) gen 12964
     key (FIRST_CHUNK_TREE CHUNK_ITEM 85684660469760) block 
79866020118528 (4874635017) gen 12964
     key (FIRST_CHUNK_TREE CHUNK_ITEM 85851090452480) block 
79866020134912 (4874635018) gen 12964
     key (FIRST_CHUNK_TREE CHUNK_ITEM 86017520435200) block 29261824 
(1786) gen 12964
     key (FIRST_CHUNK_TREE CHUNK_ITEM 86183950417920) block 
79866020397056 (4874635034) gen 12965
     key (FIRST_CHUNK_TREE CHUNK_ITEM 86350380400640) block 
79866020151296 (4874635019) gen 12965
     key (FIRST_CHUNK_TREE CHUNK_ITEM 86516810383360) block 
79866020167680 (4874635020) gen 12965
     key (FIRST_CHUNK_TREE CHUNK_ITEM 86683240366080) block 
79866020184064 (4874635021) gen 12965
     key (FIRST_CHUNK_TREE CHUNK_ITEM 86849670348800) block 
79866020200448 (4874635022) gen 12965
     key (FIRST_CHUNK_TREE CHUNK_ITEM 87016100331520) block 29065216 
(1774) gen 12966
     key (FIRST_CHUNK_TREE CHUNK_ITEM 87182530314240) block 
79866020315136 (4874635029) gen 12966
     key (FIRST_CHUNK_TREE CHUNK_ITEM 87348960296960) block 
79866020331520 (4874635030) gen 12966
     key (FIRST_CHUNK_TREE CHUNK_ITEM 87515390279680) block 
79866020413440 (4874635035) gen 12966
     key (FIRST_CHUNK_TREE CHUNK_ITEM 87681820262400) block 
79866020429824 (4874635036) gen 12966
     key (FIRST_CHUNK_TREE CHUNK_ITEM 87848250245120) block 29294592 
(1788) gen 12968
     key (FIRST_CHUNK_TREE CHUNK_ITEM 88014680227840) block 
79866020544512 (4874635043) gen 12968


With 1PB of btrfs volume, we will have more SYSTEM CHUNK,

and each chunk tree leaf will be more fragmented,

and the time difference will be larger.


Qu Wenruo 於 2020/7/6 下午2:16 寫道:
>
> On 2020/7/6 下午2:13, Robbie Ko wrote:
>> Does anyone have any suggestions?
> I believe David's suggestion on using regular readahead is already good
> enough for chunk tree.
>
> Especially since chunk tree is not really the main cause for slow mount.
>
> Thanks,
> Qu
>
>> robbieko 於 2020/7/1 下午5:29 寫道:
>>> From: Robbie Ko <robbieko@synology.com>
>>>
>>> When mounting, we always need to read the whole chunk tree,
>>> when there are too many chunk items, most of the time is
>>> spent on btrfs_read_chunk_tree, because we only read one
>>> leaf at a time.
>>>
>>> We fix this by adding a new readahead mode READA_FORWARD_FORCE,
>>> which reads all the leaves after the key in the node when
>>> reading a level 1 node.
>>>
>>> Signed-off-by: Robbie Ko <robbieko@synology.com>
>>> ---
>>>    fs/btrfs/ctree.c   | 7 +++++--
>>>    fs/btrfs/ctree.h   | 2 +-
>>>    fs/btrfs/volumes.c | 1 +
>>>    3 files changed, 7 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
>>> index 3a7648bff42c..abb9108e2d7d 100644
>>> --- a/fs/btrfs/ctree.c
>>> +++ b/fs/btrfs/ctree.c
>>> @@ -2194,7 +2194,7 @@ static void reada_for_search(struct
>>> btrfs_fs_info *fs_info,
>>>                if (nr == 0)
>>>                    break;
>>>                nr--;
>>> -        } else if (path->reada == READA_FORWARD) {
>>> +        } else if (path->reada == READA_FORWARD || path->reada ==
>>> READA_FORWARD_FORCE) {
>>>                nr++;
>>>                if (nr >= nritems)
>>>                    break;
>>> @@ -2205,12 +2205,15 @@ static void reada_for_search(struct
>>> btrfs_fs_info *fs_info,
>>>                    break;
>>>            }
>>>            search = btrfs_node_blockptr(node, nr);
>>> -        if ((search <= target && target - search <= 65536) ||
>>> +        if ((path->reada == READA_FORWARD_FORCE) ||
>>> +            (search <= target && target - search <= 65536) ||
>>>                (search > target && search - target <= 65536)) {
>>>                readahead_tree_block(fs_info, search);
>>>                nread += blocksize;
>>>            }
>>>            nscan++;
>>> +        if (path->reada == READA_FORWARD_FORCE)
>>> +            continue;
>>>            if ((nread > 65536 || nscan > 32))
>>>                break;
>>>        }
>>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>>> index d404cce8ae40..808bcbdc9530 100644
>>> --- a/fs/btrfs/ctree.h
>>> +++ b/fs/btrfs/ctree.h
>>> @@ -353,7 +353,7 @@ struct btrfs_node {
>>>     * The slots array records the index of the item or block pointer
>>>     * used while walking the tree.
>>>     */
>>> -enum { READA_NONE, READA_BACK, READA_FORWARD };
>>> +enum { READA_NONE, READA_BACK, READA_FORWARD, READA_FORWARD_FORCE };
>>>    struct btrfs_path {
>>>        struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
>>>        int slots[BTRFS_MAX_LEVEL];
>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>> index 0d6e785bcb98..78fd65abff69 100644
>>> --- a/fs/btrfs/volumes.c
>>> +++ b/fs/btrfs/volumes.c
>>> @@ -7043,6 +7043,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info
>>> *fs_info)
>>>        path = btrfs_alloc_path();
>>>        if (!path)
>>>            return -ENOMEM;
>>> +    path->reada = READA_FORWARD_FORCE;
>>>          /*
>>>         * uuid_mutex is needed only if we are mounting a sprout FS
Qu Wenruo July 6, 2020, 8:28 a.m. UTC | #4
On 2020/7/6 下午4:05, Robbie Ko wrote:
> 
> I've known btrfs_read_block_groups for a long time,
> 
> we can use BG_TREE freature to speed up btrfs_read_block_groups.
> 
> https://lwn.net/Articles/801990/
> 
> 
> But reading the chunk tree also takes some time,
> 
> we can speed up the chunk tree by using the readahead mechanism.
> 
> Why we not just use regular forward readahead?
> - Because the regular forward readahead ,
>   reads only the logical address adjacent to the 64k,
>   but the logical address of the next leaf may not be in 64k.

Great, please add these into the changelog for the need of
READA_FORWARD_FORCE.

But on the other hand, it looks like we could change READA_FORWARD
without introducing the _FORCE version,
as _FORCE variant is what we really want.


That logical bytenr limit looks not reasonable to me, which makes that
READA_FORWARD near useless to me.

Although in that direction, we also need to verify all existing
READA_FORWARD are really correctly.

Personally I tend to prefer change existing READA_FORWARD and review all
existing callers.

> 
> I have a test environment as follows:
> 
> 200TB btrfs volume: used 192TB
> 
> Data, single: total=192.00TiB, used=192.00TiB
> System, DUP: total=40.00MiB, used=19.91MiB
> Metadata, DUP: total=63.00GiB, used=46.46GiB
> GlobalReserve, single: total=2.00GiB, used=0.00B
> 
> chunk tree level : 2
> chunk tree tree:
>   nodes: 4
>   leaves: 1270
>   total: 1274
> chunk tree size: 19.9 MB
> SYSTEM chunks count : 2 (8MB, 32MB)
> 
> btrfs_read_chunk_tree spends the following time :
> 
> before: 1.89s
> 
> patch: 0.27s

That's great, although 2s is already fast enough to me :)

Thanks,
Qu

> 
> Speed increase of about 85%.
> 
> Between the chunk tree leaves, there will be a different SYSTEM chunk,
> 
> when the The more frequent the chunk allocate/remove, the larger the gap
> between the leaves.
> 
> e.g. chunk tree node :
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 85020014280704) block
> 79866020003840 (4874635010) gen 12963
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 85185370521600) block 28999680
> (1770) gen 12964
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 85351800504320) block
> 79866020347904 (4874635031) gen 12964
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 85518230487040) block
> 79866020102144 (4874635016) gen 12964
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 85684660469760) block
> 79866020118528 (4874635017) gen 12964
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 85851090452480) block
> 79866020134912 (4874635018) gen 12964
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 86017520435200) block 29261824
> (1786) gen 12964
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 86183950417920) block
> 79866020397056 (4874635034) gen 12965
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 86350380400640) block
> 79866020151296 (4874635019) gen 12965
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 86516810383360) block
> 79866020167680 (4874635020) gen 12965
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 86683240366080) block
> 79866020184064 (4874635021) gen 12965
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 86849670348800) block
> 79866020200448 (4874635022) gen 12965
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 87016100331520) block 29065216
> (1774) gen 12966
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 87182530314240) block
> 79866020315136 (4874635029) gen 12966
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 87348960296960) block
> 79866020331520 (4874635030) gen 12966
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 87515390279680) block
> 79866020413440 (4874635035) gen 12966
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 87681820262400) block
> 79866020429824 (4874635036) gen 12966
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 87848250245120) block 29294592
> (1788) gen 12968
>     key (FIRST_CHUNK_TREE CHUNK_ITEM 88014680227840) block
> 79866020544512 (4874635043) gen 12968
> 
> 
> With 1PB of btrfs volume, we will have more SYSTEM CHUNK,
> 
> and each chunk tree leaf will be more fragmented,
> 
> and the time difference will be larger.
> 
> 
> Qu Wenruo 於 2020/7/6 下午2:16 寫道:
>>
>> On 2020/7/6 下午2:13, Robbie Ko wrote:
>>> Does anyone have any suggestions?
>> I believe David's suggestion on using regular readahead is already good
>> enough for chunk tree.
>>
>> Especially since chunk tree is not really the main cause for slow mount.
>>
>> Thanks,
>> Qu
>>
>>> robbieko 於 2020/7/1 下午5:29 寫道:
>>>> From: Robbie Ko <robbieko@synology.com>
>>>>
>>>> When mounting, we always need to read the whole chunk tree,
>>>> when there are too many chunk items, most of the time is
>>>> spent on btrfs_read_chunk_tree, because we only read one
>>>> leaf at a time.
>>>>
>>>> We fix this by adding a new readahead mode READA_FORWARD_FORCE,
>>>> which reads all the leaves after the key in the node when
>>>> reading a level 1 node.
>>>>
>>>> Signed-off-by: Robbie Ko <robbieko@synology.com>
>>>> ---
>>>>    fs/btrfs/ctree.c   | 7 +++++--
>>>>    fs/btrfs/ctree.h   | 2 +-
>>>>    fs/btrfs/volumes.c | 1 +
>>>>    3 files changed, 7 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
>>>> index 3a7648bff42c..abb9108e2d7d 100644
>>>> --- a/fs/btrfs/ctree.c
>>>> +++ b/fs/btrfs/ctree.c
>>>> @@ -2194,7 +2194,7 @@ static void reada_for_search(struct
>>>> btrfs_fs_info *fs_info,
>>>>                if (nr == 0)
>>>>                    break;
>>>>                nr--;
>>>> -        } else if (path->reada == READA_FORWARD) {
>>>> +        } else if (path->reada == READA_FORWARD || path->reada ==
>>>> READA_FORWARD_FORCE) {
>>>>                nr++;
>>>>                if (nr >= nritems)
>>>>                    break;
>>>> @@ -2205,12 +2205,15 @@ static void reada_for_search(struct
>>>> btrfs_fs_info *fs_info,
>>>>                    break;
>>>>            }
>>>>            search = btrfs_node_blockptr(node, nr);
>>>> -        if ((search <= target && target - search <= 65536) ||
>>>> +        if ((path->reada == READA_FORWARD_FORCE) ||
>>>> +            (search <= target && target - search <= 65536) ||
>>>>                (search > target && search - target <= 65536)) {
>>>>                readahead_tree_block(fs_info, search);
>>>>                nread += blocksize;
>>>>            }
>>>>            nscan++;
>>>> +        if (path->reada == READA_FORWARD_FORCE)
>>>> +            continue;
>>>>            if ((nread > 65536 || nscan > 32))
>>>>                break;
>>>>        }
>>>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>>>> index d404cce8ae40..808bcbdc9530 100644
>>>> --- a/fs/btrfs/ctree.h
>>>> +++ b/fs/btrfs/ctree.h
>>>> @@ -353,7 +353,7 @@ struct btrfs_node {
>>>>     * The slots array records the index of the item or block pointer
>>>>     * used while walking the tree.
>>>>     */
>>>> -enum { READA_NONE, READA_BACK, READA_FORWARD };
>>>> +enum { READA_NONE, READA_BACK, READA_FORWARD, READA_FORWARD_FORCE };
>>>>    struct btrfs_path {
>>>>        struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
>>>>        int slots[BTRFS_MAX_LEVEL];
>>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>>> index 0d6e785bcb98..78fd65abff69 100644
>>>> --- a/fs/btrfs/volumes.c
>>>> +++ b/fs/btrfs/volumes.c
>>>> @@ -7043,6 +7043,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info
>>>> *fs_info)
>>>>        path = btrfs_alloc_path();
>>>>        if (!path)
>>>>            return -ENOMEM;
>>>> +    path->reada = READA_FORWARD_FORCE;
>>>>          /*
>>>>         * uuid_mutex is needed only if we are mounting a sprout FS
Nikolay Borisov July 6, 2020, 8:37 a.m. UTC | #5
On 6.07.20 г. 11:28 ч., Qu Wenruo wrote:
> 
> 
> On 2020/7/6 下午4:05, Robbie Ko wrote:
>>
>> I've known btrfs_read_block_groups for a long time,
>>
>> we can use BG_TREE freature to speed up btrfs_read_block_groups.
>>
>> https://lwn.net/Articles/801990/
>>
>>
>> But reading the chunk tree also takes some time,
>>
>> we can speed up the chunk tree by using the readahead mechanism.
>>
>> Why we not just use regular forward readahead?
>> - Because the regular forward readahead ,
>>   reads only the logical address adjacent to the 64k,
>>   but the logical address of the next leaf may not be in 64k.
> 
> Great, please add these into the changelog for the need of
> READA_FORWARD_FORCE.

<nod> Performance patches should come with data backing their
performance improvements claims.

> 
> But on the other hand, it looks like we could change READA_FORWARD
> without introducing the _FORCE version,
> as _FORCE variant is what we really want.
> 
> 
> That logical bytenr limit looks not reasonable to me, which makes that
> READA_FORWARD near useless to me.
> 
> Although in that direction, we also need to verify all existing
> READA_FORWARD are really correctly.
> 
> Personally I tend to prefer change existing READA_FORWARD and review all
> existing callers.

I agree, we should ideally revise the existing READA_FORWARD and give it
better semantics if the current ones are needlessly rigid. Only if this
improves way too cumbersome i.e breaking assumption which are made
should we introduce yet another mode. Because this other mode is really
just special casing and we should avoid special cases as much as possible.


<snip>
robbieko July 6, 2020, 10:07 a.m. UTC | #6
Nikolay Borisov 於 2020/7/6 下午4:37 寫道:
>
> On 6.07.20 г. 11:28 ч., Qu Wenruo wrote:
>>
>> On 2020/7/6 下午4:05, Robbie Ko wrote:
>>> I've known btrfs_read_block_groups for a long time,
>>>
>>> we can use BG_TREE freature to speed up btrfs_read_block_groups.
>>>
>>> https://lwn.net/Articles/801990/
>>>
>>>
>>> But reading the chunk tree also takes some time,
>>>
>>> we can speed up the chunk tree by using the readahead mechanism.
>>>
>>> Why we not just use regular forward readahead?
>>> - Because the regular forward readahead ,
>>>    reads only the logical address adjacent to the 64k,
>>>    but the logical address of the next leaf may not be in 64k.
>> Great, please add these into the changelog for the need of
>> READA_FORWARD_FORCE.
> <nod> Performance patches should come with data backing their
> performance improvements claims.

Ok, I will add these into the changelog.


>
>> But on the other hand, it looks like we could change READA_FORWARD
>> without introducing the _FORCE version,
>> as _FORCE variant is what we really want.
>>
>>
>> That logical bytenr limit looks not reasonable to me, which makes that
>> READA_FORWARD near useless to me.
>>
>> Although in that direction, we also need to verify all existing
>> READA_FORWARD are really correctly.
>>
>> Personally I tend to prefer change existing READA_FORWARD and review all
>> existing callers.
> I agree, we should ideally revise the existing READA_FORWARD and give it
> better semantics if the current ones are needlessly rigid. Only if this
> improves way too cumbersome i.e breaking assumption which are made
> should we introduce yet another mode. Because this other mode is really
> just special casing and we should avoid special cases as much as possible.


Ok, I'll modify the existing READA_FORWARD mechanism to make it better

and send a V2 patch.


>
>
> <snip>

Patch
diff mbox series

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3a7648bff42c..abb9108e2d7d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2194,7 +2194,7 @@  static void reada_for_search(struct btrfs_fs_info *fs_info,
 			if (nr == 0)
 				break;
 			nr--;
-		} else if (path->reada == READA_FORWARD) {
+		} else if (path->reada == READA_FORWARD || path->reada == READA_FORWARD_FORCE) {
 			nr++;
 			if (nr >= nritems)
 				break;
@@ -2205,12 +2205,15 @@  static void reada_for_search(struct btrfs_fs_info *fs_info,
 				break;
 		}
 		search = btrfs_node_blockptr(node, nr);
-		if ((search <= target && target - search <= 65536) ||
+		if ((path->reada == READA_FORWARD_FORCE) ||
+		    (search <= target && target - search <= 65536) ||
 		    (search > target && search - target <= 65536)) {
 			readahead_tree_block(fs_info, search);
 			nread += blocksize;
 		}
 		nscan++;
+		if (path->reada == READA_FORWARD_FORCE)
+			continue;
 		if ((nread > 65536 || nscan > 32))
 			break;
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d404cce8ae40..808bcbdc9530 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -353,7 +353,7 @@  struct btrfs_node {
  * The slots array records the index of the item or block pointer
  * used while walking the tree.
  */
-enum { READA_NONE, READA_BACK, READA_FORWARD };
+enum { READA_NONE, READA_BACK, READA_FORWARD, READA_FORWARD_FORCE };
 struct btrfs_path {
 	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0d6e785bcb98..78fd65abff69 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7043,6 +7043,7 @@  int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->reada = READA_FORWARD_FORCE;
 
 	/*
 	 * uuid_mutex is needed only if we are mounting a sprout FS