diff mbox series

[RFC,1/2] btrfs: map uncontinuous extent buffer pages into virtual address space

Message ID 46e2952cfe5b76733f5c2b22f11832f062be6200.1690249862.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: make extent buffer memory continuous | expand

Commit Message

Qu Wenruo July 25, 2023, 2:57 a.m. UTC
Currently btrfs implements its extent buffer read-write using various
helpers doing cross-page handling for the pages array.

However, other filesystems like XFS map the pages into the kernel
virtual address space, which greatly simplifies access.

This patch would learn from XFS and map the pages into virtual address
space, if and only if the pages are not physically continuous.
(Note, a single page counts as physically continuous.)

For now we only do the map, but not yet really utilize the mapped
address.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/extent_io.c | 70 ++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_io.h |  7 +++++
 2 files changed, 77 insertions(+)

Comments

David Sterba July 27, 2023, 2:18 p.m. UTC | #1
On Tue, Jul 25, 2023 at 10:57:21AM +0800, Qu Wenruo wrote:
> Currently btrfs implements its extent buffer read-write using various
> helpers doing cross-page handling for the pages array.
> 
> However other filesystems like XFS is mapping the pages into kernel
> virtual address space, greatly simplify the access.
> 
> This patch would learn from XFS and map the pages into virtual address
> space, if and only if the pages are not physically continuous.
> (Note, a single page counts as physically continuous.)
> 
> For now we only do the map, but not yet really utilize the mapped
> address.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/extent_io.c | 70 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/extent_io.h |  7 +++++
>  2 files changed, 77 insertions(+)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 4144c649718e..f40d48f641c0 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -14,6 +14,7 @@
>  #include <linux/pagevec.h>
>  #include <linux/prefetch.h>
>  #include <linux/fsverity.h>
> +#include <linux/vmalloc.h>
>  #include "misc.h"
>  #include "extent_io.h"
>  #include "extent-io-tree.h"
> @@ -3206,6 +3207,8 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
>  	ASSERT(!extent_buffer_under_io(eb));
>  
>  	num_pages = num_extent_pages(eb);
> +	if (eb->vaddr)
> +		vm_unmap_ram(eb->vaddr, num_pages);
>  	for (i = 0; i < num_pages; i++) {
>  		struct page *page = eb->pages[i];
>  
> @@ -3255,6 +3258,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>  {
>  	int i;
>  	struct extent_buffer *new;
> +	bool pages_contig = true;
>  	int num_pages = num_extent_pages(src);
>  	int ret;
>  
> @@ -3279,6 +3283,9 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>  		int ret;
>  		struct page *p = new->pages[i];
>  
> +		if (i && p != new->pages[i - 1] + 1)
> +			pages_contig = false;
> +
>  		ret = attach_extent_buffer_page(new, p, NULL);
>  		if (ret < 0) {
>  			btrfs_release_extent_buffer(new);
> @@ -3286,6 +3293,23 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>  		}
>  		WARN_ON(PageDirty(p));
>  	}
> +	if (!pages_contig) {
> +		unsigned int nofs_flag;
> +		int retried = 0;
> +
> +		nofs_flag = memalloc_nofs_save();
> +		do {
> +			new->vaddr = vm_map_ram(new->pages, num_pages, -1);
> +			if (new->vaddr)
> +				break;
> +			vm_unmap_aliases();
> +		} while ((retried++) <= 1);
> +		memalloc_nofs_restore(nofs_flag);
> +		if (!new->vaddr) {
> +			btrfs_release_extent_buffer(new);
> +			return NULL;
> +		}
> +	}
>  	copy_extent_buffer_full(new, src);
>  	set_extent_buffer_uptodate(new);
>  
> @@ -3296,6 +3320,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>  						  u64 start, unsigned long len)
>  {
>  	struct extent_buffer *eb;
> +	bool pages_contig = true;
>  	int num_pages;
>  	int i;
>  	int ret;
> @@ -3312,11 +3337,29 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>  	for (i = 0; i < num_pages; i++) {
>  		struct page *p = eb->pages[i];
>  
> +		if (i && p != eb->pages[i - 1] + 1)
> +			pages_contig = false;

Chances that allocated pages in eb->pages will be contiguous decrease
over time basically to zero, because even one page out of order will
ruin it. This means we can assume that virtual mapping will have to be
used almost every time.

The virtual mapping can also fail and we have no fallback and there are
two more places when allocating extent buffer can fail.

There's alloc_pages(gfp, order) that can try to allocate contiguous
pages of a given order, and we have nodesize always matching power of
two so we could use it. Although this also forces alignment to the same
order, which we don't need, and adds to the failure modes.
Qu Wenruo July 27, 2023, 10:24 p.m. UTC | #2
On 2023/7/27 22:18, David Sterba wrote:
> On Tue, Jul 25, 2023 at 10:57:21AM +0800, Qu Wenruo wrote:
>> Currently btrfs implements its extent buffer read-write using various
>> helpers doing cross-page handling for the pages array.
>>
>> However other filesystems like XFS is mapping the pages into kernel
>> virtual address space, greatly simplify the access.
>>
>> This patch would learn from XFS and map the pages into virtual address
>> space, if and only if the pages are not physically continuous.
>> (Note, a single page counts as physically continuous.)
>>
>> For now we only do the map, but not yet really utilize the mapped
>> address.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/extent_io.c | 70 ++++++++++++++++++++++++++++++++++++++++++++
>>   fs/btrfs/extent_io.h |  7 +++++
>>   2 files changed, 77 insertions(+)
>>
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index 4144c649718e..f40d48f641c0 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -14,6 +14,7 @@
>>   #include <linux/pagevec.h>
>>   #include <linux/prefetch.h>
>>   #include <linux/fsverity.h>
>> +#include <linux/vmalloc.h>
>>   #include "misc.h"
>>   #include "extent_io.h"
>>   #include "extent-io-tree.h"
>> @@ -3206,6 +3207,8 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
>>   	ASSERT(!extent_buffer_under_io(eb));
>>
>>   	num_pages = num_extent_pages(eb);
>> +	if (eb->vaddr)
>> +		vm_unmap_ram(eb->vaddr, num_pages);
>>   	for (i = 0; i < num_pages; i++) {
>>   		struct page *page = eb->pages[i];
>>
>> @@ -3255,6 +3258,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>>   {
>>   	int i;
>>   	struct extent_buffer *new;
>> +	bool pages_contig = true;
>>   	int num_pages = num_extent_pages(src);
>>   	int ret;
>>
>> @@ -3279,6 +3283,9 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>>   		int ret;
>>   		struct page *p = new->pages[i];
>>
>> +		if (i && p != new->pages[i - 1] + 1)
>> +			pages_contig = false;
>> +
>>   		ret = attach_extent_buffer_page(new, p, NULL);
>>   		if (ret < 0) {
>>   			btrfs_release_extent_buffer(new);
>> @@ -3286,6 +3293,23 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>>   		}
>>   		WARN_ON(PageDirty(p));
>>   	}
>> +	if (!pages_contig) {
>> +		unsigned int nofs_flag;
>> +		int retried = 0;
>> +
>> +		nofs_flag = memalloc_nofs_save();
>> +		do {
>> +			new->vaddr = vm_map_ram(new->pages, num_pages, -1);
>> +			if (new->vaddr)
>> +				break;
>> +			vm_unmap_aliases();
>> +		} while ((retried++) <= 1);
>> +		memalloc_nofs_restore(nofs_flag);
>> +		if (!new->vaddr) {
>> +			btrfs_release_extent_buffer(new);
>> +			return NULL;
>> +		}
>> +	}
>>   	copy_extent_buffer_full(new, src);
>>   	set_extent_buffer_uptodate(new);
>>
>> @@ -3296,6 +3320,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>>   						  u64 start, unsigned long len)
>>   {
>>   	struct extent_buffer *eb;
>> +	bool pages_contig = true;
>>   	int num_pages;
>>   	int i;
>>   	int ret;
>> @@ -3312,11 +3337,29 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>>   	for (i = 0; i < num_pages; i++) {
>>   		struct page *p = eb->pages[i];
>>
>> +		if (i && p != eb->pages[i - 1] + 1)
>> +			pages_contig = false;
>
> Chances that allocated pages in eb->pages will be contiguous decrease
> over time basically to zero, because even one page out of order will
> ruin it.

I doubt this assumption.

Shouldn't things like THP requires physically contiguous pages?
Thus if your assumption is correct, then THP would not work for long
running servers at all, which doesn't look correct to me.

> This means we can assume that virtual mapping will have to be
> used almost every time.
>
> The virtual mapping can also fail and we have no fallback and there are
> two more places when allocating extent buffer can fail.

Yes, it can indeed fail, but it's only when the virtual address space is
full. This is more of a problem for 32bit systems.

Although my counter argument is, XFS has been doing this for a long time,
and even though XFS has much smaller metadata compared to btrfs, it
doesn't have a known problem of hitting such failure.

>
> There's alloc_pages(gfp, order) that can try to allocate contiguous
> pages of a given order, and we have nodesize always matching power of
> two so we could use it.

Should this lead to problems of exhausted contiguous pages (if that's
really true)?

> Although this also forces alignment to the same
> order, which we don't need, and adds to the failure modes.

We're migrating to reject non-nodesize aligned tree block in the long run.

I have submitted a patch to warn about such tree blocks already:
https://patchwork.kernel.org/project/linux-btrfs/patch/fee2c7df3d0a2e91e9b5e07a04242fcf28ade6bf.1690178924.git.wqu@suse.com/

Since btrfs has a similar (but less strict, just checks cross-stripe
boundary) checks a long time ago, and we haven't received such warning
for a while, I believe we can gradually move to reject such tree blocks.

Thanks,
Qu
Qu Wenruo Aug. 17, 2023, 11:32 a.m. UTC | #3
On 2023/7/28 06:24, Qu Wenruo wrote:
>
>
> On 2023/7/27 22:18, David Sterba wrote:
>> On Tue, Jul 25, 2023 at 10:57:21AM +0800, Qu Wenruo wrote:
>>> Currently btrfs implements its extent buffer read-write using various
>>> helpers doing cross-page handling for the pages array.
>>>
>>> However other filesystems like XFS is mapping the pages into kernel
>>> virtual address space, greatly simplify the access.
>>>
>>> This patch would learn from XFS and map the pages into virtual address
>>> space, if and only if the pages are not physically continuous.
>>> (Note, a single page counts as physically continuous.)
>>>
>>> For now we only do the map, but not yet really utilize the mapped
>>> address.
>>>
>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>> ---
>>>   fs/btrfs/extent_io.c | 70 ++++++++++++++++++++++++++++++++++++++++++++
>>>   fs/btrfs/extent_io.h |  7 +++++
>>>   2 files changed, 77 insertions(+)
>>>
>>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>>> index 4144c649718e..f40d48f641c0 100644
>>> --- a/fs/btrfs/extent_io.c
>>> +++ b/fs/btrfs/extent_io.c
>>> @@ -14,6 +14,7 @@
>>>   #include <linux/pagevec.h>
>>>   #include <linux/prefetch.h>
>>>   #include <linux/fsverity.h>
>>> +#include <linux/vmalloc.h>
>>>   #include "misc.h"
>>>   #include "extent_io.h"
>>>   #include "extent-io-tree.h"
>>> @@ -3206,6 +3207,8 @@ static void
>>> btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
>>>       ASSERT(!extent_buffer_under_io(eb));
>>>
>>>       num_pages = num_extent_pages(eb);
>>> +    if (eb->vaddr)
>>> +        vm_unmap_ram(eb->vaddr, num_pages);
>>>       for (i = 0; i < num_pages; i++) {
>>>           struct page *page = eb->pages[i];
>>>
>>> @@ -3255,6 +3258,7 @@ struct extent_buffer
>>> *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>>>   {
>>>       int i;
>>>       struct extent_buffer *new;
>>> +    bool pages_contig = true;
>>>       int num_pages = num_extent_pages(src);
>>>       int ret;
>>>
>>> @@ -3279,6 +3283,9 @@ struct extent_buffer
>>> *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>>>           int ret;
>>>           struct page *p = new->pages[i];
>>>
>>> +        if (i && p != new->pages[i - 1] + 1)
>>> +            pages_contig = false;
>>> +
>>>           ret = attach_extent_buffer_page(new, p, NULL);
>>>           if (ret < 0) {
>>>               btrfs_release_extent_buffer(new);
>>> @@ -3286,6 +3293,23 @@ struct extent_buffer
>>> *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>>>           }
>>>           WARN_ON(PageDirty(p));
>>>       }
>>> +    if (!pages_contig) {
>>> +        unsigned int nofs_flag;
>>> +        int retried = 0;
>>> +
>>> +        nofs_flag = memalloc_nofs_save();
>>> +        do {
>>> +            new->vaddr = vm_map_ram(new->pages, num_pages, -1);
>>> +            if (new->vaddr)
>>> +                break;
>>> +            vm_unmap_aliases();
>>> +        } while ((retried++) <= 1);
>>> +        memalloc_nofs_restore(nofs_flag);
>>> +        if (!new->vaddr) {
>>> +            btrfs_release_extent_buffer(new);
>>> +            return NULL;
>>> +        }
>>> +    }
>>>       copy_extent_buffer_full(new, src);
>>>       set_extent_buffer_uptodate(new);
>>>
>>> @@ -3296,6 +3320,7 @@ struct extent_buffer
>>> *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>>>                             u64 start, unsigned long len)
>>>   {
>>>       struct extent_buffer *eb;
>>> +    bool pages_contig = true;
>>>       int num_pages;
>>>       int i;
>>>       int ret;
>>> @@ -3312,11 +3337,29 @@ struct extent_buffer
>>> *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>>>       for (i = 0; i < num_pages; i++) {
>>>           struct page *p = eb->pages[i];
>>>
>>> +        if (i && p != eb->pages[i - 1] + 1)
>>> +            pages_contig = false;
>>
>> Chances that allocated pages in eb->pages will be contiguous decrease
>> over time basically to zero, because even one page out of order will
>> ruin it.
>
> I doubt this assumption.
>
> Shouldn't things like THP requires physically contiguous pages?
> Thus if your assumption is correct, then THP would not work for long
> running servers at all, which doesn't look correct to me.
>
>> This means we can assume that virtual mapping will have to be
>> used almost every time.
>>
>> The virtual mapping can also fail and we have no fallback and there are
>> two more places when allocating extent buffer can fail.
>
> Yes, it can indeed fail, but it's only when the virtual address space is
> full. This is more of a problem for 32bit systems.
>
> Although my counter argument is, XFS is doing this for a long time, and
> even XFS has much smaller metadata compared to btrfs, it doesn't has a
> known problem of hitting such failure.
>
>>
>> There's alloc_pages(gfp, order) that can try to allocate contiguous
>> pages of a given order, and we have nodesize always matching power of
>> two so we could use it.
>
> Should this lead to problems of exhausted contiguous pages (if that's
> really true)?
>
>> Although this also forces alignment to the same
>> order, which we don't need, and adds to the failure modes.
>
> We're migrating to reject non-nodesize aligned tree block in the long run.
>
> I have submitted a patch to warn about such tree blocks already:
> https://patchwork.kernel.org/project/linux-btrfs/patch/fee2c7df3d0a2e91e9b5e07a04242fcf28ade6bf.1690178924.git.wqu@suse.com/
>
> Since btrfs has a similar (but less strict, just checks cross-stripe
> boundary) checks a long time ago, and we haven't received such warning
> for a while, I believe we can gradually move to reject such tree blocks.

Any extra feedback? Without this series, it's much harder to go folio
for eb pages.

Thanks,
Qu
>
> Thanks,
> Qu
diff mbox series

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4144c649718e..f40d48f641c0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -14,6 +14,7 @@ 
 #include <linux/pagevec.h>
 #include <linux/prefetch.h>
 #include <linux/fsverity.h>
+#include <linux/vmalloc.h>
 #include "misc.h"
 #include "extent_io.h"
 #include "extent-io-tree.h"
@@ -3206,6 +3207,8 @@  static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
 	ASSERT(!extent_buffer_under_io(eb));
 
 	num_pages = num_extent_pages(eb);
+	if (eb->vaddr)
+		vm_unmap_ram(eb->vaddr, num_pages);
 	for (i = 0; i < num_pages; i++) {
 		struct page *page = eb->pages[i];
 
@@ -3255,6 +3258,7 @@  struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 {
 	int i;
 	struct extent_buffer *new;
+	bool pages_contig = true;
 	int num_pages = num_extent_pages(src);
 	int ret;
 
@@ -3279,6 +3283,9 @@  struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 		int ret;
 		struct page *p = new->pages[i];
 
+		if (i && p != new->pages[i - 1] + 1)
+			pages_contig = false;
+
 		ret = attach_extent_buffer_page(new, p, NULL);
 		if (ret < 0) {
 			btrfs_release_extent_buffer(new);
@@ -3286,6 +3293,23 @@  struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 		}
 		WARN_ON(PageDirty(p));
 	}
+	if (!pages_contig) {
+		unsigned int nofs_flag;
+		int retried = 0;
+
+		nofs_flag = memalloc_nofs_save();
+		do {
+			new->vaddr = vm_map_ram(new->pages, num_pages, -1);
+			if (new->vaddr)
+				break;
+			vm_unmap_aliases();
+		} while ((retried++) <= 1);
+		memalloc_nofs_restore(nofs_flag);
+		if (!new->vaddr) {
+			btrfs_release_extent_buffer(new);
+			return NULL;
+		}
+	}
 	copy_extent_buffer_full(new, src);
 	set_extent_buffer_uptodate(new);
 
@@ -3296,6 +3320,7 @@  struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 						  u64 start, unsigned long len)
 {
 	struct extent_buffer *eb;
+	bool pages_contig = true;
 	int num_pages;
 	int i;
 	int ret;
@@ -3312,11 +3337,29 @@  struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = eb->pages[i];
 
+		if (i && p != eb->pages[i - 1] + 1)
+			pages_contig = false;
+
 		ret = attach_extent_buffer_page(eb, p, NULL);
 		if (ret < 0)
 			goto err;
 	}
 
+	if (!pages_contig) {
+		unsigned int nofs_flag;
+		int retried = 0;
+
+		nofs_flag = memalloc_nofs_save();
+		do {
+			eb->vaddr = vm_map_ram(eb->pages, num_pages, -1);
+			if (eb->vaddr)
+				break;
+			vm_unmap_aliases();
+		} while ((retried++) <= 1);
+		memalloc_nofs_restore(nofs_flag);
+		if (!eb->vaddr)
+			goto err;
+	}
 	set_extent_buffer_uptodate(eb);
 	btrfs_set_header_nritems(eb, 0);
 	set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@@ -3539,6 +3582,7 @@  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	struct address_space *mapping = fs_info->btree_inode->i_mapping;
 	struct btrfs_subpage *prealloc = NULL;
 	u64 lockdep_owner = owner_root;
+	bool pages_contig = true;
 	int uptodate = 1;
 	int ret;
 
@@ -3611,6 +3655,10 @@  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 		/* Should not fail, as we have preallocated the memory */
 		ret = attach_extent_buffer_page(eb, p, prealloc);
 		ASSERT(!ret);
+
+		if (i && p != eb->pages[i - 1] + 1)
+			pages_contig = false;
+
 		/*
 		 * To inform we have extra eb under allocation, so that
 		 * detach_extent_buffer_page() won't release the page private
@@ -3636,6 +3684,28 @@  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 		 * we could crash.
 		 */
 	}
+
+	/*
+	 * If pages are not continuous, here we map it into a continuous virtual
+	 * range to make later access easier.
+	 */
+	if (!pages_contig) {
+		unsigned int nofs_flag;
+		int retried = 0;
+
+		nofs_flag = memalloc_nofs_save();
+		do {
+			eb->vaddr = vm_map_ram(eb->pages, num_pages, -1);
+			if (eb->vaddr)
+				break;
+			vm_unmap_aliases();
+		} while ((retried++) <= 1);
+		memalloc_nofs_restore(nofs_flag);
+		if (!eb->vaddr) {
+			exists = ERR_PTR(-ENOMEM);
+			goto free_eb;
+		}
+	}
 	if (uptodate)
 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 again:
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5966d810af7b..f1505c3a05cc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -88,6 +88,13 @@  struct extent_buffer {
 
 	struct rw_semaphore lock;
 
+	/*
+	 * For virtually mapped address.
+	 *
+	 * NULL if the pages are physically continuous.
+	 */
+	void *vaddr;
+
 	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
 #ifdef CONFIG_BTRFS_DEBUG
 	struct list_head leak_list;