
[v4,04/18] btrfs: make attach_extent_buffer_page() handle the subpage case

Message ID 20210116071533.105780-5-wqu@suse.com
State New, archived
Series btrfs: add read-only support for subpage sector size

Commit Message

Qu Wenruo Jan. 16, 2021, 7:15 a.m. UTC
For the subpage case, we need to allocate new memory for each metadata page.

So we need to:
- Allow attach_extent_buffer_page() to return int
  To indicate allocation failure

- Prealloc btrfs_subpage structure for alloc_extent_buffer()
  We don't want to do memory allocation with a spinlock held, so
  do the preallocation before we acquire mapping->private_lock.

- Handle the subpage and regular cases differently in
  attach_extent_buffer_page()
  For the regular case, just do the usual thing.
  For the subpage case, allocate new memory or use the preallocated memory.

For future subpage metadata, we will make more use of the radix tree to
grab extent buffers.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/extent_io.c | 75 ++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/subpage.h   | 17 ++++++++++
 2 files changed, 82 insertions(+), 10 deletions(-)

Comments

David Sterba Jan. 18, 2021, 10:51 p.m. UTC | #1
On Sat, Jan 16, 2021 at 03:15:19PM +0800, Qu Wenruo wrote:
> For subpage case, we need to allocate new memory for each metadata page.
> 
> So we need to:
> - Allow attach_extent_buffer_page() to return int
>   To indicate allocation failure
> 
> - Prealloc btrfs_subpage structure for alloc_extent_buffer()
>   We don't want to do memory allocation with a spinlock held, so
>   do preallocation before we acquire mapping->private_lock.
> 
> - Handle subpage and regular case differently in
>   attach_extent_buffer_page()
>   For regular case, just do the usual thing.
>   For subpage case, allocate new memory or use the preallocated memory.
> 
> For future subpage metadata, we will make more use of the radix tree to
> grab extent buffers.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/extent_io.c | 75 ++++++++++++++++++++++++++++++++++++++------
>  fs/btrfs/subpage.h   | 17 ++++++++++
>  2 files changed, 82 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index a816ba4a8537..320731487ac0 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -24,6 +24,7 @@
>  #include "rcu-string.h"
>  #include "backref.h"
>  #include "disk-io.h"
> +#include "subpage.h"
>  
>  static struct kmem_cache *extent_state_cache;
>  static struct kmem_cache *extent_buffer_cache;
> @@ -3140,9 +3141,13 @@ static int submit_extent_page(unsigned int opf,
>  	return ret;
>  }
>  
> -static void attach_extent_buffer_page(struct extent_buffer *eb,
> -				      struct page *page)
> +static int attach_extent_buffer_page(struct extent_buffer *eb,
> +				      struct page *page,
> +				      struct btrfs_subpage *prealloc)
>  {
> +	struct btrfs_fs_info *fs_info = eb->fs_info;
> +	int ret;
> +
>  	/*
>  	 * If the page is mapped to btree inode, we should hold the private
>  	 * lock to prevent race.
> @@ -3152,10 +3157,32 @@ static void attach_extent_buffer_page(struct extent_buffer *eb,
>  	if (page->mapping)
>  		lockdep_assert_held(&page->mapping->private_lock);
>  
> -	if (!PagePrivate(page))
> -		attach_page_private(page, eb);
> -	else
> -		WARN_ON(page->private != (unsigned long)eb);
> +	if (fs_info->sectorsize == PAGE_SIZE) {
> +		if (!PagePrivate(page))
> +			attach_page_private(page, eb);
> +		else
> +			WARN_ON(page->private != (unsigned long)eb);
> +		return 0;
> +	}
> +
> +	/* Already mapped, just free prealloc */
> +	if (PagePrivate(page)) {
> +		kfree(prealloc);
> +		return 0;
> +	}
> +
> +	if (prealloc) {
> +		/* Has preallocated memory for subpage */
> +		spin_lock_init(&prealloc->lock);
> +		attach_page_private(page, prealloc);
> +	} else {
> +		/* Do new allocation to attach subpage */
> +		ret = btrfs_attach_subpage(fs_info, page);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
> +	return 0;
>  }
>  
>  void set_page_extent_mapped(struct page *page)
> @@ -5062,21 +5089,29 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>  	if (new == NULL)
>  		return NULL;
>  
> +	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
> +	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
> +
>  	for (i = 0; i < num_pages; i++) {
> +		int ret;
> +
>  		p = alloc_page(GFP_NOFS);
>  		if (!p) {
>  			btrfs_release_extent_buffer(new);
>  			return NULL;
>  		}
> -		attach_extent_buffer_page(new, p);
> +		ret = attach_extent_buffer_page(new, p, NULL);
> +		if (ret < 0) {
> +			put_page(p);
> +			btrfs_release_extent_buffer(new);
> +			return NULL;
> +		}
>  		WARN_ON(PageDirty(p));
>  		SetPageUptodate(p);
>  		new->pages[i] = p;
>  		copy_page(page_address(p), page_address(src->pages[i]));
>  	}
>  
> -	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
> -	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
>  
>  	return new;
>  }
> @@ -5308,12 +5343,28 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  
>  	num_pages = num_extent_pages(eb);
>  	for (i = 0; i < num_pages; i++, index++) {
> +		struct btrfs_subpage *prealloc = NULL;
> +
>  		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
>  		if (!p) {
>  			exists = ERR_PTR(-ENOMEM);
>  			goto free_eb;
>  		}
>  
> +		/*
> +		 * Preallocate page->private for subpage case, so that
> +		 * we won't allocate memory with private_lock hold.
> +		 * The memory will be freed by attach_extent_buffer_page() or
> +		 * freed manually if exit earlier.
> +		 */
> +		ret = btrfs_alloc_subpage(fs_info, &prealloc);
> +		if (ret < 0) {
> +			unlock_page(p);
> +			put_page(p);
> +			exists = ERR_PTR(ret);
> +			goto free_eb;
> +		}
> +
>  		spin_lock(&mapping->private_lock);
>  		exists = grab_extent_buffer(p);
>  		if (exists) {
> @@ -5321,10 +5372,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  			unlock_page(p);
>  			put_page(p);
>  			mark_extent_buffer_accessed(exists, p);
> +			kfree(prealloc);
>  			goto free_eb;
>  		}
> -		attach_extent_buffer_page(eb, p);
> +		/* Should not fail, as we have preallocated the memory */
> +		ret = attach_extent_buffer_page(eb, p, prealloc);
> +		ASSERT(!ret);
>  		spin_unlock(&mapping->private_lock);
> +
>  		WARN_ON(PageDirty(p));
>  		eb->pages[i] = p;
>  		if (!PageUptodate(p))
> diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
> index 96f3b226913e..f701256dd1e2 100644
> --- a/fs/btrfs/subpage.h
> +++ b/fs/btrfs/subpage.h
> @@ -23,8 +23,25 @@
>  struct btrfs_subpage {
>  	/* Common members for both data and metadata pages */
>  	spinlock_t lock;
> +	union {
> +		/* Structures only used by metadata */
> +		/* Structures only used by data */
> +	};
>  };
>  
> +/* For rare cases where we need to pre-allocate a btrfs_subpage structure */

Function comments should start with "what it does", so something like

"Allocate additional page data for cases where page represents more than
one block"

Imagine somebody reading the code who can't tell what the function does
just from the name, and then goes to the comment. The comment is supposed
to answer that.
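
As a concrete example, the helper's comment could read something like this
(wording adapted from the suggestion above, not taken from the patch):

/*
 * Allocate additional page data for the cases where a page represents
 * more than one block (sectorsize < PAGE_SIZE). A no-op for the regular
 * sectorsize == PAGE_SIZE case.
 */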

> +static inline int btrfs_alloc_subpage(struct btrfs_fs_info *fs_info,
> +				      struct btrfs_subpage **ret)
> +{
> +	if (fs_info->sectorsize == PAGE_SIZE)
> +		return 0;
> +
> +	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
> +	if (!*ret)
> +		return -ENOMEM;
> +	return 0;
> +}
> +
>  int btrfs_attach_subpage(struct btrfs_fs_info *fs_info, struct page *page);
>  void btrfs_detach_subpage(struct btrfs_fs_info *fs_info, struct page *page);
>  
> -- 
> 2.30.0
Josef Bacik Jan. 19, 2021, 9:54 p.m. UTC | #2
On 1/16/21 2:15 AM, Qu Wenruo wrote:
> For subpage case, we need to allocate new memory for each metadata page.
> 
> So we need to:
> - Allow attach_extent_buffer_page() to return int
>    To indicate allocation failure
> 
> - Prealloc btrfs_subpage structure for alloc_extent_buffer()
>    We don't want to do memory allocation with a spinlock held, so
>    do preallocation before we acquire mapping->private_lock.
> 
> - Handle subpage and regular case differently in
>    attach_extent_buffer_page()
>    For regular case, just do the usual thing.
>    For subpage case, allocate new memory or use the preallocated memory.
> 
> For future subpage metadata, we will make more use of the radix tree to
> grab extent buffers.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>   fs/btrfs/extent_io.c | 75 ++++++++++++++++++++++++++++++++++++++------
>   fs/btrfs/subpage.h   | 17 ++++++++++
>   2 files changed, 82 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index a816ba4a8537..320731487ac0 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -24,6 +24,7 @@
>   #include "rcu-string.h"
>   #include "backref.h"
>   #include "disk-io.h"
> +#include "subpage.h"
>   
>   static struct kmem_cache *extent_state_cache;
>   static struct kmem_cache *extent_buffer_cache;
> @@ -3140,9 +3141,13 @@ static int submit_extent_page(unsigned int opf,
>   	return ret;
>   }
>   
> -static void attach_extent_buffer_page(struct extent_buffer *eb,
> -				      struct page *page)
> +static int attach_extent_buffer_page(struct extent_buffer *eb,
> +				      struct page *page,
> +				      struct btrfs_subpage *prealloc)
>   {
> +	struct btrfs_fs_info *fs_info = eb->fs_info;
> +	int ret;

int ret = 0;

> +
>   	/*
>   	 * If the page is mapped to btree inode, we should hold the private
>   	 * lock to prevent race.
> @@ -3152,10 +3157,32 @@ static void attach_extent_buffer_page(struct extent_buffer *eb,
>   	if (page->mapping)
>   		lockdep_assert_held(&page->mapping->private_lock);
>   
> -	if (!PagePrivate(page))
> -		attach_page_private(page, eb);
> -	else
> -		WARN_ON(page->private != (unsigned long)eb);
> +	if (fs_info->sectorsize == PAGE_SIZE) {
> +		if (!PagePrivate(page))
> +			attach_page_private(page, eb);
> +		else
> +			WARN_ON(page->private != (unsigned long)eb);
> +		return 0;
> +	}
> +
> +	/* Already mapped, just free prealloc */
> +	if (PagePrivate(page)) {
> +		kfree(prealloc);
> +		return 0;
> +	}
> +
> +	if (prealloc) {
> +		/* Has preallocated memory for subpage */
> +		spin_lock_init(&prealloc->lock);
> +		attach_page_private(page, prealloc);
> +	} else {
> +		/* Do new allocation to attach subpage */
> +		ret = btrfs_attach_subpage(fs_info, page);
> +		if (ret < 0)
> +			return ret;

Delete the above 2 lines.

> +	}
> +
> +	return 0;

return ret;
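
Putting the three suggestions together, the tail of the subpage branch
would end up roughly like this (a sketch of the suggested shape, not the
committed code):

	int ret = 0;
	...
	if (prealloc) {
		/* Has preallocated memory for subpage */
		spin_lock_init(&prealloc->lock);
		attach_page_private(page, prealloc);
	} else {
		/* Do new allocation to attach subpage */
		ret = btrfs_attach_subpage(fs_info, page);
	}

	return ret;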

>   }
>   
>   void set_page_extent_mapped(struct page *page)
> @@ -5062,21 +5089,29 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>   	if (new == NULL)
>   		return NULL;
>   
> +	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
> +	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
> +

Why are you doing this here?  It seems unrelated?  Looking at the code it 
appears there's a reason for this later, but I had to go look to make sure I 
wasn't crazy, so at the very least it needs to be done in a more relevant patch.

>   	for (i = 0; i < num_pages; i++) {
> +		int ret;
> +
>   		p = alloc_page(GFP_NOFS);
>   		if (!p) {
>   			btrfs_release_extent_buffer(new);
>   			return NULL;
>   		}
> -		attach_extent_buffer_page(new, p);
> +		ret = attach_extent_buffer_page(new, p, NULL);
> +		if (ret < 0) {
> +			put_page(p);
> +			btrfs_release_extent_buffer(new);
> +			return NULL;
> +		}
>   		WARN_ON(PageDirty(p));
>   		SetPageUptodate(p);
>   		new->pages[i] = p;
>   		copy_page(page_address(p), page_address(src->pages[i]));
>   	}
>   
> -	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
> -	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
>   
>   	return new;
>   }
> @@ -5308,12 +5343,28 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>   
>   	num_pages = num_extent_pages(eb);
>   	for (i = 0; i < num_pages; i++, index++) {
> +		struct btrfs_subpage *prealloc = NULL;
> +
>   		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
>   		if (!p) {
>   			exists = ERR_PTR(-ENOMEM);
>   			goto free_eb;
>   		}
>   
> +		/*
> +		 * Preallocate page->private for subpage case, so that
> +		 * we won't allocate memory with private_lock hold.
> +		 * The memory will be freed by attach_extent_buffer_page() or
> +		 * freed manually if exit earlier.
> +		 */
> +		ret = btrfs_alloc_subpage(fs_info, &prealloc);
> +		if (ret < 0) {
> +			unlock_page(p);
> +			put_page(p);
> +			exists = ERR_PTR(ret);
> +			goto free_eb;
> +		}
> +

I realize that for subpage sectorsize we'll only have 1 page, but I'd still 
rather see this outside of the for loop, just for clarity sake.

>   		spin_lock(&mapping->private_lock);
>   		exists = grab_extent_buffer(p);
>   		if (exists) {
> @@ -5321,10 +5372,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>   			unlock_page(p);
>   			put_page(p);
>   			mark_extent_buffer_accessed(exists, p);
> +			kfree(prealloc);
>   			goto free_eb;
>   		}
> -		attach_extent_buffer_page(eb, p);
> +		/* Should not fail, as we have preallocated the memory */
> +		ret = attach_extent_buffer_page(eb, p, prealloc);
> +		ASSERT(!ret);
>   		spin_unlock(&mapping->private_lock);
> +
>   		WARN_ON(PageDirty(p));
>   		eb->pages[i] = p;
>   		if (!PageUptodate(p))
> diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
> index 96f3b226913e..f701256dd1e2 100644
> --- a/fs/btrfs/subpage.h
> +++ b/fs/btrfs/subpage.h
> @@ -23,8 +23,25 @@
>   struct btrfs_subpage {
>   	/* Common members for both data and metadata pages */
>   	spinlock_t lock;
> +	union {
> +		/* Structures only used by metadata */
> +		/* Structures only used by data */
> +	};
>   };
>   
> +/* For rare cases where we need to pre-allocate a btrfs_subpage structure */
> +static inline int btrfs_alloc_subpage(struct btrfs_fs_info *fs_info,
> +				      struct btrfs_subpage **ret)
> +{
> +	if (fs_info->sectorsize == PAGE_SIZE)
> +		return 0;
> +
> +	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
> +	if (!*ret)
> +		return -ENOMEM;
> +	return 0;
> +}

We're allocating these for every metadata page, that deserves a dedicated 
kmem_cache.  Thanks,

Josef
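
A dedicated cache along these lines might look roughly as follows (a
hypothetical sketch; none of these names appear in the patch, and see
David's reply below on why creating the cache unconditionally has a cost):

	/* Hypothetical dedicated slab cache for struct btrfs_subpage */
	static struct kmem_cache *btrfs_subpage_cachep;

	int __init btrfs_subpage_init(void)
	{
		btrfs_subpage_cachep = kmem_cache_create("btrfs_subpage",
				sizeof(struct btrfs_subpage), 0, 0, NULL);
		if (!btrfs_subpage_cachep)
			return -ENOMEM;
		return 0;
	}

	/* btrfs_alloc_subpage() would then use: */
	*ret = kmem_cache_zalloc(btrfs_subpage_cachep, GFP_NOFS);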
David Sterba Jan. 19, 2021, 10:35 p.m. UTC | #3
On Tue, Jan 19, 2021 at 04:54:28PM -0500, Josef Bacik wrote:
> On 1/16/21 2:15 AM, Qu Wenruo wrote:
> > +/* For rare cases where we need to pre-allocate a btrfs_subpage structure */
> > +static inline int btrfs_alloc_subpage(struct btrfs_fs_info *fs_info,
> > +				      struct btrfs_subpage **ret)
> > +{
> > +	if (fs_info->sectorsize == PAGE_SIZE)
> > +		return 0;
> > +
> > +	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
> > +	if (!*ret)
> > +		return -ENOMEM;
> > +	return 0;
> > +}
> 
> We're allocating these for every metadata page, that deserves a dedicated 
> kmem_cache.  Thanks,

I'm not opposed to that idea but for the first implementation I'm ok
with using the default slabs. As the subpage support depends on the
filesystem, creating the cache unconditionally would waste resources and
creating it on demand would need some care. Either way I'd rather see it
in a separate patch.
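
To illustrate the "some care" part: an on-demand variant would have to
guard against two filesystems racing to create the cache, e.g. (purely
illustrative, not proposed code):

	static DEFINE_MUTEX(subpage_cache_mutex);
	static struct kmem_cache *subpage_cache;

	static int subpage_cache_ensure(void)
	{
		int ret = 0;

		mutex_lock(&subpage_cache_mutex);
		if (!subpage_cache)
			subpage_cache = kmem_cache_create("btrfs_subpage",
					sizeof(struct btrfs_subpage), 0, 0, NULL);
		if (!subpage_cache)
			ret = -ENOMEM;
		mutex_unlock(&subpage_cache_mutex);
		return ret;
	}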
Qu Wenruo Jan. 20, 2021, 12:27 a.m. UTC | #4
On 2021/1/20 5:54 AM, Josef Bacik wrote:
> On 1/16/21 2:15 AM, Qu Wenruo wrote:
>> For subpage case, we need to allocate new memory for each metadata page.
>>
>> So we need to:
>> - Allow attach_extent_buffer_page() to return int
>>    To indicate allocation failure
>>
>> - Prealloc btrfs_subpage structure for alloc_extent_buffer()
>>    We don't want to do memory allocation with a spinlock held, so
>>    do preallocation before we acquire mapping->private_lock.
>>
>> - Handle subpage and regular case differently in
>>    attach_extent_buffer_page()
>>    For regular case, just do the usual thing.
>>    For subpage case, allocate new memory or use the preallocated memory.
>>
>> For future subpage metadata, we will make more use of the radix tree to
>> grab extent buffers.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/extent_io.c | 75 ++++++++++++++++++++++++++++++++++++++------
>>   fs/btrfs/subpage.h   | 17 ++++++++++
>>   2 files changed, 82 insertions(+), 10 deletions(-)
>>
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index a816ba4a8537..320731487ac0 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -24,6 +24,7 @@
>>   #include "rcu-string.h"
>>   #include "backref.h"
>>   #include "disk-io.h"
>> +#include "subpage.h"
>>   static struct kmem_cache *extent_state_cache;
>>   static struct kmem_cache *extent_buffer_cache;
>> @@ -3140,9 +3141,13 @@ static int submit_extent_page(unsigned int opf,
>>       return ret;
>>   }
>> -static void attach_extent_buffer_page(struct extent_buffer *eb,
>> -                      struct page *page)
>> +static int attach_extent_buffer_page(struct extent_buffer *eb,
>> +                      struct page *page,
>> +                      struct btrfs_subpage *prealloc)
>>   {
>> +    struct btrfs_fs_info *fs_info = eb->fs_info;
>> +    int ret;
> 
> int ret = 0;
> 
>> +
>>       /*
>>        * If the page is mapped to btree inode, we should hold the private
>>        * lock to prevent race.
>> @@ -3152,10 +3157,32 @@ static void attach_extent_buffer_page(struct 
>> extent_buffer *eb,
>>       if (page->mapping)
>>           lockdep_assert_held(&page->mapping->private_lock);
>> -    if (!PagePrivate(page))
>> -        attach_page_private(page, eb);
>> -    else
>> -        WARN_ON(page->private != (unsigned long)eb);
>> +    if (fs_info->sectorsize == PAGE_SIZE) {
>> +        if (!PagePrivate(page))
>> +            attach_page_private(page, eb);
>> +        else
>> +            WARN_ON(page->private != (unsigned long)eb);
>> +        return 0;
>> +    }
>> +
>> +    /* Already mapped, just free prealloc */
>> +    if (PagePrivate(page)) {
>> +        kfree(prealloc);
>> +        return 0;
>> +    }
>> +
>> +    if (prealloc) {
>> +        /* Has preallocated memory for subpage */
>> +        spin_lock_init(&prealloc->lock);
>> +        attach_page_private(page, prealloc);
>> +    } else {
>> +        /* Do new allocation to attach subpage */
>> +        ret = btrfs_attach_subpage(fs_info, page);
>> +        if (ret < 0)
>> +            return ret;
> 
> Delete the above 2 lines.
> 
>> +    }
>> +
>> +    return 0;
> 
> return ret;
> 
>>   }
>>   void set_page_extent_mapped(struct page *page)
>> @@ -5062,21 +5089,29 @@ struct extent_buffer 
>> *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>>       if (new == NULL)
>>           return NULL;
>> +    set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
>> +    set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
>> +
> 
> Why are you doing this here?  It seems unrelated?  Looking at the code 
> it appears there's a reason for this later, but I had to go look to make 
> sure I wasn't crazy, so at the very least it needs to be done in a more 
> relevant patch.

This is to handle the case where we allocated a page but failed to allocate
the subpage structure.

In that case, btrfs_release_extent_buffer() will go through a different
routine to free the eb.

Without the UNMAPPED bit, it just goes wrong without knowing it's an unmapped eb.

This change is mostly due to the extra failure pattern introduced by the 
subpage memory allocation.
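
For context, the release path distinguishes the two cases roughly like this
(simplified from btrfs_release_extent_buffer_pages() of that era, not part
of this patch):

	int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
	...
	if (mapped)
		spin_lock(&page->mapping->private_lock);
	/* page->private is detached only for mapped ebs */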

> 
>>       for (i = 0; i < num_pages; i++) {
>> +        int ret;
>> +
>>           p = alloc_page(GFP_NOFS);
>>           if (!p) {
>>               btrfs_release_extent_buffer(new);
>>               return NULL;
>>           }
>> -        attach_extent_buffer_page(new, p);
>> +        ret = attach_extent_buffer_page(new, p, NULL);
>> +        if (ret < 0) {
>> +            put_page(p);
>> +            btrfs_release_extent_buffer(new);
>> +            return NULL;
>> +        }
>>           WARN_ON(PageDirty(p));
>>           SetPageUptodate(p);
>>           new->pages[i] = p;
>>           copy_page(page_address(p), page_address(src->pages[i]));
>>       }
>> -    set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
>> -    set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
>>       return new;
>>   }
>> @@ -5308,12 +5343,28 @@ struct extent_buffer 
>> *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>       num_pages = num_extent_pages(eb);
>>       for (i = 0; i < num_pages; i++, index++) {
>> +        struct btrfs_subpage *prealloc = NULL;
>> +
>>           p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
>>           if (!p) {
>>               exists = ERR_PTR(-ENOMEM);
>>               goto free_eb;
>>           }
>> +        /*
>> +         * Preallocate page->private for subpage case, so that
>> +         * we won't allocate memory with private_lock hold.
>> +         * The memory will be freed by attach_extent_buffer_page() or
>> +         * freed manually if exit earlier.
>> +         */
>> +        ret = btrfs_alloc_subpage(fs_info, &prealloc);
>> +        if (ret < 0) {
>> +            unlock_page(p);
>> +            put_page(p);
>> +            exists = ERR_PTR(ret);
>> +            goto free_eb;
>> +        }
>> +
> 
> I realize that for subpage sectorsize we'll only have 1 page, but I'd 
> still rather see this outside of the for loop, just for clarity sake.

This is the trade-off.
Either we do everything separately, sharing the minimal amount of code (and
needing an extra for loop for future 16K pages), or we use the same loop and
sacrifice a little readability.

Here I'd say sharing more code is not that big a deal.

> 
>>           spin_lock(&mapping->private_lock);
>>           exists = grab_extent_buffer(p);
>>           if (exists) {
>> @@ -5321,10 +5372,14 @@ struct extent_buffer 
>> *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>               unlock_page(p);
>>               put_page(p);
>>               mark_extent_buffer_accessed(exists, p);
>> +            kfree(prealloc);
>>               goto free_eb;
>>           }
>> -        attach_extent_buffer_page(eb, p);
>> +        /* Should not fail, as we have preallocated the memory */
>> +        ret = attach_extent_buffer_page(eb, p, prealloc);
>> +        ASSERT(!ret);
>>           spin_unlock(&mapping->private_lock);
>> +
>>           WARN_ON(PageDirty(p));
>>           eb->pages[i] = p;
>>           if (!PageUptodate(p))
>> diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
>> index 96f3b226913e..f701256dd1e2 100644
>> --- a/fs/btrfs/subpage.h
>> +++ b/fs/btrfs/subpage.h
>> @@ -23,8 +23,25 @@
>>   struct btrfs_subpage {
>>       /* Common members for both data and metadata pages */
>>       spinlock_t lock;
>> +    union {
>> +        /* Structures only used by metadata */
>> +        /* Structures only used by data */
>> +    };
>>   };
>> +/* For rare cases where we need to pre-allocate a btrfs_subpage 
>> structure */
>> +static inline int btrfs_alloc_subpage(struct btrfs_fs_info *fs_info,
>> +                      struct btrfs_subpage **ret)
>> +{
>> +    if (fs_info->sectorsize == PAGE_SIZE)
>> +        return 0;
>> +
>> +    *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
>> +    if (!*ret)
>> +        return -ENOMEM;
>> +    return 0;
>> +}
> 
> We're allocating these for every metadata page, that deserves a 
> dedicated kmem_cache.  Thanks,

That makes sense, especially as it will be used for both data and metadata in the subpage case.

Thanks,
Qu

> 
> Josef
Josef Bacik Jan. 20, 2021, 2:22 p.m. UTC | #5
On 1/19/21 7:27 PM, Qu Wenruo wrote:
> 
> 
> On 2021/1/20 上午5:54, Josef Bacik wrote:
>> On 1/16/21 2:15 AM, Qu Wenruo wrote:
>>> For subpage case, we need to allocate new memory for each metadata page.
>>>
>>> So we need to:
>>> - Allow attach_extent_buffer_page() to return int
>>>    To indicate allocation failure
>>>
>>> - Prealloc btrfs_subpage structure for alloc_extent_buffer()
>>>    We don't want to do memory allocation with a spinlock held, so
>>>    do preallocation before we acquire mapping->private_lock.
>>>
>>> - Handle subpage and regular case differently in
>>>    attach_extent_buffer_page()
>>>    For regular case, just do the usual thing.
>>>    For subpage case, allocate new memory or use the preallocated memory.
>>>
>>> For future subpage metadata, we will make more use of the radix tree to
>>> grab extent buffers.
>>>
>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>> ---
>>>   fs/btrfs/extent_io.c | 75 ++++++++++++++++++++++++++++++++++++++------
>>>   fs/btrfs/subpage.h   | 17 ++++++++++
>>>   2 files changed, 82 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>>> index a816ba4a8537..320731487ac0 100644
>>> --- a/fs/btrfs/extent_io.c
>>> +++ b/fs/btrfs/extent_io.c
>>> @@ -24,6 +24,7 @@
>>>   #include "rcu-string.h"
>>>   #include "backref.h"
>>>   #include "disk-io.h"
>>> +#include "subpage.h"
>>>   static struct kmem_cache *extent_state_cache;
>>>   static struct kmem_cache *extent_buffer_cache;
>>> @@ -3140,9 +3141,13 @@ static int submit_extent_page(unsigned int opf,
>>>       return ret;
>>>   }
>>> -static void attach_extent_buffer_page(struct extent_buffer *eb,
>>> -                      struct page *page)
>>> +static int attach_extent_buffer_page(struct extent_buffer *eb,
>>> +                      struct page *page,
>>> +                      struct btrfs_subpage *prealloc)
>>>   {
>>> +    struct btrfs_fs_info *fs_info = eb->fs_info;
>>> +    int ret;
>>
>> int ret = 0;
>>
>>> +
>>>       /*
>>>        * If the page is mapped to btree inode, we should hold the private
>>>        * lock to prevent race.
>>> @@ -3152,10 +3157,32 @@ static void attach_extent_buffer_page(struct 
>>> extent_buffer *eb,
>>>       if (page->mapping)
>>>           lockdep_assert_held(&page->mapping->private_lock);
>>> -    if (!PagePrivate(page))
>>> -        attach_page_private(page, eb);
>>> -    else
>>> -        WARN_ON(page->private != (unsigned long)eb);
>>> +    if (fs_info->sectorsize == PAGE_SIZE) {
>>> +        if (!PagePrivate(page))
>>> +            attach_page_private(page, eb);
>>> +        else
>>> +            WARN_ON(page->private != (unsigned long)eb);
>>> +        return 0;
>>> +    }
>>> +
>>> +    /* Already mapped, just free prealloc */
>>> +    if (PagePrivate(page)) {
>>> +        kfree(prealloc);
>>> +        return 0;
>>> +    }
>>> +
>>> +    if (prealloc) {
>>> +        /* Has preallocated memory for subpage */
>>> +        spin_lock_init(&prealloc->lock);
>>> +        attach_page_private(page, prealloc);
>>> +    } else {
>>> +        /* Do new allocation to attach subpage */
>>> +        ret = btrfs_attach_subpage(fs_info, page);
>>> +        if (ret < 0)
>>> +            return ret;
>>
>> Delete the above 2 lines.
>>
>>> +    }
>>> +
>>> +    return 0;
>>
>> return ret;
>>
>>>   }
>>>   void set_page_extent_mapped(struct page *page)
>>> @@ -5062,21 +5089,29 @@ struct extent_buffer *btrfs_clone_extent_buffer(const 
>>> struct extent_buffer *src)
>>>       if (new == NULL)
>>>           return NULL;
>>> +    set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
>>> +    set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
>>> +
>>
>> Why are you doing this here?  It seems unrelated?  Looking at the code it 
>> appears there's a reason for this later, but I had to go look to make sure I 
>> wasn't crazy, so at the very least it needs to be done in a more relevant patch.
> 
> This is to handle the case where we allocated a page but failed to allocate
> the subpage structure.
> 
> In that case, btrfs_release_extent_buffer() will go through a different
> routine to free the eb.
> 
> Without the UNMAPPED bit, it just goes wrong without knowing it's an unmapped eb.
> 
> This change is mostly due to the extra failure pattern introduced by the subpage 
> memory allocation.
> 

Yes, but my point is it's unrelated to this change, and in fact the problem 
exists outside of your changes, so it needs to be addressed in its own patch 
with its own changelog.

>>
>>>       for (i = 0; i < num_pages; i++) {
>>> +        int ret;
>>> +
>>>           p = alloc_page(GFP_NOFS);
>>>           if (!p) {
>>>               btrfs_release_extent_buffer(new);
>>>               return NULL;
>>>           }
>>> -        attach_extent_buffer_page(new, p);
>>> +        ret = attach_extent_buffer_page(new, p, NULL);
>>> +        if (ret < 0) {
>>> +            put_page(p);
>>> +            btrfs_release_extent_buffer(new);
>>> +            return NULL;
>>> +        }
>>>           WARN_ON(PageDirty(p));
>>>           SetPageUptodate(p);
>>>           new->pages[i] = p;
>>>           copy_page(page_address(p), page_address(src->pages[i]));
>>>       }
>>> -    set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
>>> -    set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
>>>       return new;
>>>   }
>>> @@ -5308,12 +5343,28 @@ struct extent_buffer *alloc_extent_buffer(struct 
>>> btrfs_fs_info *fs_info,
>>>       num_pages = num_extent_pages(eb);
>>>       for (i = 0; i < num_pages; i++, index++) {
>>> +        struct btrfs_subpage *prealloc = NULL;
>>> +
>>>           p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
>>>           if (!p) {
>>>               exists = ERR_PTR(-ENOMEM);
>>>               goto free_eb;
>>>           }
>>> +        /*
>>> +         * Preallocate page->private for subpage case, so that
>>> +         * we won't allocate memory with private_lock hold.
>>> +         * The memory will be freed by attach_extent_buffer_page() or
>>> +         * freed manually if exit earlier.
>>> +         */
>>> +        ret = btrfs_alloc_subpage(fs_info, &prealloc);
>>> +        if (ret < 0) {
>>> +            unlock_page(p);
>>> +            put_page(p);
>>> +            exists = ERR_PTR(ret);
>>> +            goto free_eb;
>>> +        }
>>> +
>>
>> I realize that for subpage sectorsize we'll only have 1 page, but I'd still 
>> rather see this outside of the for loop, just for clarity sake.
> 
> This is the trade-off.
> Either we do everything separately, sharing the minimal amount of code (and
> needing an extra for loop for future 16K pages), or we use the same loop and
> sacrifice a little readability.
> 
> Here I'd say sharing more code is not that big a deal.
> 

It's not a tradeoff, it's confusing.  What I'm suggesting is you do

ret = btrfs_alloc_subpage(fs_info, &prealloc);
if (ret) {
	exists = ERR_PTR(ret);
	goto free_eb;
}
for (i = 0; i < num_pages; i++, index++) {
}

free_eb:
	kmem_cache_free(prealloc);

The subpage portion is part of the eb itself, and there's one per eb, and thus 
should be pre-allocated outside of the loop that is doing the page lookup, as 
it's logically a different thing.  Thanks,

Josef
Qu Wenruo Jan. 21, 2021, 1:20 a.m. UTC | #6
On 2021/1/20 10:22 PM, Josef Bacik wrote:
> On 1/19/21 7:27 PM, Qu Wenruo wrote:
>>
>>
>> On 2021/1/20 上午5:54, Josef Bacik wrote:
>>> On 1/16/21 2:15 AM, Qu Wenruo wrote:
>>>> For subpage case, we need to allocate new memory for each metadata 
>>>> page.
>>>>
>>>> So we need to:
>>>> - Allow attach_extent_buffer_page() to return int
>>>>    To indicate allocation failure
>>>>
>>>> - Prealloc btrfs_subpage structure for alloc_extent_buffer()
>>>>    We don't want to do memory allocation with a spinlock held, so
>>>>    do preallocation before we acquire mapping->private_lock.
>>>>
>>>> - Handle subpage and regular case differently in
>>>>    attach_extent_buffer_page()
>>>>    For regular case, just do the usual thing.
>>>>    For subpage case, allocate new memory or use the preallocated 
>>>> memory.
>>>>
>>>> For future subpage metadata, we will make more use of the radix tree to
>>>> grab extent buffers.
>>>>
>>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>>> ---
>>>>   fs/btrfs/extent_io.c | 75 
>>>> ++++++++++++++++++++++++++++++++++++++------
>>>>   fs/btrfs/subpage.h   | 17 ++++++++++
>>>>   2 files changed, 82 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>>>> index a816ba4a8537..320731487ac0 100644
>>>> --- a/fs/btrfs/extent_io.c
>>>> +++ b/fs/btrfs/extent_io.c
>>>> @@ -24,6 +24,7 @@
>>>>   #include "rcu-string.h"
>>>>   #include "backref.h"
>>>>   #include "disk-io.h"
>>>> +#include "subpage.h"
>>>>   static struct kmem_cache *extent_state_cache;
>>>>   static struct kmem_cache *extent_buffer_cache;
>>>> @@ -3140,9 +3141,13 @@ static int submit_extent_page(unsigned int opf,
>>>>       return ret;
>>>>   }
>>>> -static void attach_extent_buffer_page(struct extent_buffer *eb,
>>>> -                      struct page *page)
>>>> +static int attach_extent_buffer_page(struct extent_buffer *eb,
>>>> +                      struct page *page,
>>>> +                      struct btrfs_subpage *prealloc)
>>>>   {
>>>> +    struct btrfs_fs_info *fs_info = eb->fs_info;
>>>> +    int ret;
>>>
>>> int ret = 0;
>>>
>>>> +
>>>>       /*
>>>>        * If the page is mapped to btree inode, we should hold the 
>>>> private
>>>>        * lock to prevent race.
>>>> @@ -3152,10 +3157,32 @@ static void attach_extent_buffer_page(struct 
>>>> extent_buffer *eb,
>>>>       if (page->mapping)
>>>>           lockdep_assert_held(&page->mapping->private_lock);
>>>> -    if (!PagePrivate(page))
>>>> -        attach_page_private(page, eb);
>>>> -    else
>>>> -        WARN_ON(page->private != (unsigned long)eb);
>>>> +    if (fs_info->sectorsize == PAGE_SIZE) {
>>>> +        if (!PagePrivate(page))
>>>> +            attach_page_private(page, eb);
>>>> +        else
>>>> +            WARN_ON(page->private != (unsigned long)eb);
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    /* Already mapped, just free prealloc */
>>>> +    if (PagePrivate(page)) {
>>>> +        kfree(prealloc);
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    if (prealloc) {
>>>> +        /* Has preallocated memory for subpage */
>>>> +        spin_lock_init(&prealloc->lock);
>>>> +        attach_page_private(page, prealloc);
>>>> +    } else {
>>>> +        /* Do new allocation to attach subpage */
>>>> +        ret = btrfs_attach_subpage(fs_info, page);
>>>> +        if (ret < 0)
>>>> +            return ret;
>>>
>>> Delete the above 2 lines.
>>>
>>>> +    }
>>>> +
>>>> +    return 0;
>>>
>>> return ret;
>>>
>>>>   }
>>>>   void set_page_extent_mapped(struct page *page)
>>>> @@ -5062,21 +5089,29 @@ struct extent_buffer 
>>>> *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>>>>       if (new == NULL)
>>>>           return NULL;
>>>> +    set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
>>>> +    set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
>>>> +
>>>
>>> Why are you doing this here?  It seems unrelated?  Looking at the 
>>> code it appears there's a reason for this later, but I had to go look 
>>> to make sure I wasn't crazy, so at the very least it needs to be done 
>>> in a more relevant patch.
>>
>> This is to handle the case where we allocated a page but failed to
>> allocate the subpage structure.
>>
>> In that case, btrfs_release_extent_buffer() will go through a different
>> routine to free the eb.
>>
>> Without the UNMAPPED bit, it just goes wrong without knowing it's an
>> unmapped eb.
>>
>> This change is mostly due to the extra failure pattern introduced by 
>> the subpage memory allocation.
>>
> 
> Yes, but my point is it's unrelated to this change, and in fact the 
> problem exists outside of your changes, so it needs to be addressed in 
> its own patch with its own changelog.

OK, that makes sense.

But it needs to be determined after deciding how to handle the dummy extent
buffer first.
> 
>>>
>>>>       for (i = 0; i < num_pages; i++) {
>>>> +        int ret;
>>>> +
>>>>           p = alloc_page(GFP_NOFS);
>>>>           if (!p) {
>>>>               btrfs_release_extent_buffer(new);
>>>>               return NULL;
>>>>           }
>>>> -        attach_extent_buffer_page(new, p);
>>>> +        ret = attach_extent_buffer_page(new, p, NULL);
>>>> +        if (ret < 0) {
>>>> +            put_page(p);
>>>> +            btrfs_release_extent_buffer(new);
>>>> +            return NULL;
>>>> +        }
>>>>           WARN_ON(PageDirty(p));
>>>>           SetPageUptodate(p);
>>>>           new->pages[i] = p;
>>>>           copy_page(page_address(p), page_address(src->pages[i]));
>>>>       }
>>>> -    set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
>>>> -    set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
>>>>       return new;
>>>>   }
>>>> @@ -5308,12 +5343,28 @@ struct extent_buffer 
>>>> *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>>>       num_pages = num_extent_pages(eb);
>>>>       for (i = 0; i < num_pages; i++, index++) {
>>>> +        struct btrfs_subpage *prealloc = NULL;
>>>> +
>>>>           p = find_or_create_page(mapping, index, 
>>>> GFP_NOFS|__GFP_NOFAIL);
>>>>           if (!p) {
>>>>               exists = ERR_PTR(-ENOMEM);
>>>>               goto free_eb;
>>>>           }
>>>> +        /*
>>>> +         * Preallocate page->private for subpage case, so that
>>>> +         * we won't allocate memory with private_lock hold.
>>>> +         * The memory will be freed by attach_extent_buffer_page() or
>>>> +         * freed manually if exit earlier.
>>>> +         */
>>>> +        ret = btrfs_alloc_subpage(fs_info, &prealloc);
>>>> +        if (ret < 0) {
>>>> +            unlock_page(p);
>>>> +            put_page(p);
>>>> +            exists = ERR_PTR(ret);
>>>> +            goto free_eb;
>>>> +        }
>>>> +
>>>
>>> I realize that for subpage sectorsize we'll only have 1 page, but I'd 
>>> still rather see this outside of the for loop, just for clarity sake.
>>
>> This is the trade-off.
>> Either we do everything separately, sharing the minimal amount of code
>> (and needing an extra for loop for future 16K pages), or we use the same
>> loop and sacrifice a little readability.
>>
>> Here I'd say sharing more code is not that big a deal.
>>
> 
> It's not a tradeoff, it's confusing.  What I'm suggesting is you do
> 
> ret = btrfs_alloc_subpage(fs_info, &prealloc);
> if (ret) {
>      exists = ERR_PTR(ret);
>      goto free_eb;
> }
> for (i = 0; i < num_pages; i++, index++) {
> }

This means for later 16K page support, we still need to move 
btrfs_alloc_subpage() into the loop.

But I totally understand your point here.

I'd put a comment there explaining why we can just allocate one subpage
structure.
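
For example, something along these lines (suggested wording only, not from
the patch):

	/*
	 * For the subpage case (sectorsize < PAGE_SIZE) an extent buffer
	 * never crosses a page boundary, so the loop runs exactly once
	 * and a single btrfs_subpage preallocation is enough.
	 */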

Thanks,
Qu

> 
> free_eb:
>      kmem_cache_free(prealloc);
> 
> The subpage portion is part of the eb itself, and there's one per eb, 
> and thus should be pre-allocated outside of the loop that is doing the 
> page lookup, as it's logically a different thing.  Thanks,
> 
> Josef
>
Qu Wenruo Jan. 26, 2021, 7:29 a.m. UTC | #7
On 2021/1/20 6:35 AM, David Sterba wrote:
> On Tue, Jan 19, 2021 at 04:54:28PM -0500, Josef Bacik wrote:
>> On 1/16/21 2:15 AM, Qu Wenruo wrote:
>>> +/* For rare cases where we need to pre-allocate a btrfs_subpage structure */
>>> +static inline int btrfs_alloc_subpage(struct btrfs_fs_info *fs_info,
>>> +				      struct btrfs_subpage **ret)
>>> +{
>>> +	if (fs_info->sectorsize == PAGE_SIZE)
>>> +		return 0;
>>> +
>>> +	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
>>> +	if (!*ret)
>>> +		return -ENOMEM;
>>> +	return 0;
>>> +}
>>
>> We're allocating these for every metadata page, that deserves a dedicated
>> kmem_cache.  Thanks,
>
> I'm not opposed to that idea but for the first implementation I'm ok
> with using the default slabs. As the subpage support depends on the
> filesystem, creating the cache unconditionally would waste resources and
> creating it on demand would need some care. Either way I'd rather see it
> in a separate patch.
>
Well, too late for me to see this comment...

As I have already converted it to a kmem cache.

But the good news is, the latest version has an extra refactor of the
memory allocation/freeing, so now we just need to change two lines to
change how we allocate memory for subpage.
(Although we still need to remove the cache allocation code.)

Will convert it back to the default slab, but will also keep the refactor
there to make a later conversion to kmem_cache easier.

So don't be too surprised to see a function like the following in the next
version.

   btrfs_free_subpage(struct btrfs_subpage *subpage)
   {
	kfree(subpage);
   }

Thanks,
Qu
David Sterba Jan. 27, 2021, 7:58 p.m. UTC | #8
On Tue, Jan 26, 2021 at 03:29:17PM +0800, Qu Wenruo wrote:
> On 2021/1/20 6:35 AM, David Sterba wrote:
> > On Tue, Jan 19, 2021 at 04:54:28PM -0500, Josef Bacik wrote:
> >> On 1/16/21 2:15 AM, Qu Wenruo wrote:
> >>> +/* For rare cases where we need to pre-allocate a btrfs_subpage structure */
> >>> +static inline int btrfs_alloc_subpage(struct btrfs_fs_info *fs_info,
> >>> +				      struct btrfs_subpage **ret)
> >>> +{
> >>> +	if (fs_info->sectorsize == PAGE_SIZE)
> >>> +		return 0;
> >>> +
> >>> +	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
> >>> +	if (!*ret)
> >>> +		return -ENOMEM;
> >>> +	return 0;
> >>> +}
> >>
> >> We're allocating these for every metadata page, that deserves a dedicated
> >> kmem_cache.  Thanks,
> >
> > I'm not opposed to that idea but for the first implementation I'm ok
> > with using the default slabs. As the subpage support depends on the
> > filesystem, creating the cache unconditionally would waste resources and
> > creating it on demand would need some care. Either way I'd rather see it
> > in a separate patch.
> >
> Well, too late for me to see this comment...
> 
> As I have already converted it to a kmem cache.
> 
> But the good news is, the latest version has an extra refactor of the
> memory allocation/freeing, so now we just need to change two lines to
> change how we allocate memory for subpage.
> (Although we still need to remove the cache allocation code.)
> 
> Will convert it back to the default slab, but will also keep the refactor
> there to make a later conversion to kmem_cache easier.
> 
> So don't be too surprised to see a function like the following in the next
> version.
> 
>    btrfs_free_subpage(struct btrfs_subpage *subpage)
>    {
> 	kfree(subpage);
>    }

I had hoped to save you the time of converting it to the kmem slabs, but
there's no need to revert it back to kmalloc; keep what you have. Switching
with the helper would be easier should we need to reconsider it for some
reason.

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a816ba4a8537..320731487ac0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -24,6 +24,7 @@ 
 #include "rcu-string.h"
 #include "backref.h"
 #include "disk-io.h"
+#include "subpage.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -3140,9 +3141,13 @@  static int submit_extent_page(unsigned int opf,
 	return ret;
 }
 
-static void attach_extent_buffer_page(struct extent_buffer *eb,
-				      struct page *page)
+static int attach_extent_buffer_page(struct extent_buffer *eb,
+				      struct page *page,
+				      struct btrfs_subpage *prealloc)
 {
+	struct btrfs_fs_info *fs_info = eb->fs_info;
+	int ret;
+
 	/*
 	 * If the page is mapped to btree inode, we should hold the private
 	 * lock to prevent race.
@@ -3152,10 +3157,32 @@  static void attach_extent_buffer_page(struct extent_buffer *eb,
 	if (page->mapping)
 		lockdep_assert_held(&page->mapping->private_lock);
 
-	if (!PagePrivate(page))
-		attach_page_private(page, eb);
-	else
-		WARN_ON(page->private != (unsigned long)eb);
+	if (fs_info->sectorsize == PAGE_SIZE) {
+		if (!PagePrivate(page))
+			attach_page_private(page, eb);
+		else
+			WARN_ON(page->private != (unsigned long)eb);
+		return 0;
+	}
+
+	/* Already mapped, just free prealloc */
+	if (PagePrivate(page)) {
+		kfree(prealloc);
+		return 0;
+	}
+
+	if (prealloc) {
+		/* Has preallocated memory for subpage */
+		spin_lock_init(&prealloc->lock);
+		attach_page_private(page, prealloc);
+	} else {
+		/* Do new allocation to attach subpage */
+		ret = btrfs_attach_subpage(fs_info, page);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
 }
 
 void set_page_extent_mapped(struct page *page)
@@ -5062,21 +5089,29 @@  struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 	if (new == NULL)
 		return NULL;
 
+	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+
 	for (i = 0; i < num_pages; i++) {
+		int ret;
+
 		p = alloc_page(GFP_NOFS);
 		if (!p) {
 			btrfs_release_extent_buffer(new);
 			return NULL;
 		}
-		attach_extent_buffer_page(new, p);
+		ret = attach_extent_buffer_page(new, p, NULL);
+		if (ret < 0) {
+			put_page(p);
+			btrfs_release_extent_buffer(new);
+			return NULL;
+		}
 		WARN_ON(PageDirty(p));
 		SetPageUptodate(p);
 		new->pages[i] = p;
 		copy_page(page_address(p), page_address(src->pages[i]));
 	}
 
-	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
-	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
 
 	return new;
 }
@@ -5308,12 +5343,28 @@  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 
 	num_pages = num_extent_pages(eb);
 	for (i = 0; i < num_pages; i++, index++) {
+		struct btrfs_subpage *prealloc = NULL;
+
 		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
 		if (!p) {
 			exists = ERR_PTR(-ENOMEM);
 			goto free_eb;
 		}
 
+		/*
+		 * Preallocate page->private for subpage case, so that
+		 * we won't allocate memory with private_lock hold.
+		 * The memory will be freed by attach_extent_buffer_page() or
+		 * freed manually if exit earlier.
+		 */
+		ret = btrfs_alloc_subpage(fs_info, &prealloc);
+		if (ret < 0) {
+			unlock_page(p);
+			put_page(p);
+			exists = ERR_PTR(ret);
+			goto free_eb;
+		}
+
 		spin_lock(&mapping->private_lock);
 		exists = grab_extent_buffer(p);
 		if (exists) {
@@ -5321,10 +5372,14 @@  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 			unlock_page(p);
 			put_page(p);
 			mark_extent_buffer_accessed(exists, p);
+			kfree(prealloc);
 			goto free_eb;
 		}
-		attach_extent_buffer_page(eb, p);
+		/* Should not fail, as we have preallocated the memory */
+		ret = attach_extent_buffer_page(eb, p, prealloc);
+		ASSERT(!ret);
 		spin_unlock(&mapping->private_lock);
+
 		WARN_ON(PageDirty(p));
 		eb->pages[i] = p;
 		if (!PageUptodate(p))
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 96f3b226913e..f701256dd1e2 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -23,8 +23,25 @@ 
 struct btrfs_subpage {
 	/* Common members for both data and metadata pages */
 	spinlock_t lock;
+	union {
+		/* Structures only used by metadata */
+		/* Structures only used by data */
+	};
 };
 
+/* For rare cases where we need to pre-allocate a btrfs_subpage structure */
+static inline int btrfs_alloc_subpage(struct btrfs_fs_info *fs_info,
+				      struct btrfs_subpage **ret)
+{
+	if (fs_info->sectorsize == PAGE_SIZE)
+		return 0;
+
+	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
+	if (!*ret)
+		return -ENOMEM;
+	return 0;
+}
+
 int btrfs_attach_subpage(struct btrfs_fs_info *fs_info, struct page *page);
 void btrfs_detach_subpage(struct btrfs_fs_info *fs_info, struct page *page);
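
For reference, the caller-side pattern for this helper, as used in
alloc_extent_buffer() above, is:

	struct btrfs_subpage *prealloc = NULL;
	int ret;

	/* A no-op (prealloc stays NULL) when sectorsize == PAGE_SIZE */
	ret = btrfs_alloc_subpage(fs_info, &prealloc);
	if (ret < 0)
		return ERR_PTR(ret);
	/*
	 * prealloc is later handed to attach_extent_buffer_page(), which
	 * either attaches it to the page or frees it.
	 */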