[v5,06/18] btrfs: support subpage for extent buffer page release

Message ID 20210126083402.142577-7-wqu@suse.com (mailing list archive)
State New, archived
Series btrfs: add read-only support for subpage sector size

Commit Message

Qu Wenruo Jan. 26, 2021, 8:33 a.m. UTC
In btrfs_release_extent_buffer_pages(), we need to add extra handling
for the subpage case.

Introduce a helper, detach_extent_buffer_page(), to do the different
handling for the regular and subpage cases.

For the subpage case, it handles detaching the page private.

For unmapped (dummy or cloned) ebs, we can detach the page private
immediately, as the page can only be attached to one unmapped eb.

For mapped ebs, we have to ensure there is no other eb left in the page
range before we detach the page private, as page->private is shared
between all ebs in the same page.

But there is a subpage-specific race, where we can race with extent
buffer allocation and clear the page private while the new eb is still
being set up, like this:

  Extent buffer A is the new extent buffer which will be allocated,
  while extent buffer B is the last existing extent buffer of the page.

  		T1 (eb A) 	 |		T2 (eb B)
  -------------------------------+------------------------------
  alloc_extent_buffer()		 | btrfs_release_extent_buffer_pages()
  |- p = find_or_create_page()   | |
  |- attach_extent_buffer_page() | |
  |				 | |- detach_extent_buffer_page()
  |				 |    |- if (!page_range_has_eb())
  |				 |    |  No new eb in the page range yet
  |				 |    |  As new eb A hasn't yet been
  |				 |    |  inserted into radix tree.
  |				 |    |- btrfs_detach_subpage()
  |				 |       |- detach_page_private();
  |- radix_tree_insert()	 |

  Then we have a metadata eb whose page has no private bit.

To avoid such a race, introduce a subpage metadata-specific member,
btrfs_subpage::eb_refs.

In alloc_extent_buffer() we increase eb_refs inside the critical section
of private_lock.  Then page_range_has_eb() will return true in
detach_extent_buffer_page(), which will therefore not detach the page
private.

The critical section is marked by the new helpers (see the condensed
sketch after this list):

- btrfs_page_inc_eb_refs()
- btrfs_page_dec_eb_refs()
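
Condensed, the resulting ordering looks like this (simplified from the
code in the diff below):

  /* Allocation side, in alloc_extent_buffer() */
  spin_lock(&mapping->private_lock);
  attach_extent_buffer_page(eb, p, prealloc);
  btrfs_page_inc_eb_refs(fs_info, p);      /* pin the page private */
  spin_unlock(&mapping->private_lock);
  ...
  radix_tree_insert(...);                  /* new eb becomes visible */

  /* Release side, in detach_extent_buffer_page() */
  spin_lock(&page->mapping->private_lock);
  btrfs_page_dec_eb_refs(fs_info, page);
  if (!page_range_has_eb(fs_info, page))   /* skipped while eb A holds a ref */
          btrfs_detach_subpage(fs_info, page);
  spin_unlock(&page->mapping->private_lock);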

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 94 +++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/subpage.c   | 42 ++++++++++++++++++++
 fs/btrfs/subpage.h   | 13 +++++-
 3 files changed, 133 insertions(+), 16 deletions(-)

Comments

Josef Bacik Jan. 27, 2021, 4:21 p.m. UTC | #1
On 1/26/21 3:33 AM, Qu Wenruo wrote:
> [...]
> diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
> index 7ba544bcc9c6..eef2ecae77e0 100644
> --- a/fs/btrfs/subpage.h
> +++ b/fs/btrfs/subpage.h
> @@ -4,6 +4,7 @@
>   #define BTRFS_SUBPAGE_H
>   
>   #include <linux/spinlock.h>
> +#include <linux/refcount.h>

I made this comment elsewhere, but the patch finally showed up in my email after 
I refreshed (???? thunderbird wtf??).  Anyway you import refcount.h here, but 
don't actually use refcount_t.  Please use refcount_t, so we get the benefit of 
the debugging from the helpers.  Thanks,

Josef
David Sterba Feb. 1, 2021, 3:32 p.m. UTC | #2
On Wed, Jan 27, 2021 at 11:21:08AM -0500, Josef Bacik wrote:
> On 1/26/21 3:33 AM, Qu Wenruo wrote:
> > --- a/fs/btrfs/subpage.h
> > +++ b/fs/btrfs/subpage.h
> > @@ -4,6 +4,7 @@
> >   #define BTRFS_SUBPAGE_H
> >   
> >   #include <linux/spinlock.h>
> > +#include <linux/refcount.h>
> 
> I made this comment elsewhere, but the patch finally showed up in my email after 
> I refreshed (???? thunderbird wtf??).  Anyway you import refcount.h here, but 
> don't actually use refcount_t.  Please use refcount_t, so we get the benefit of 
> the debugging from the helpers.  Thanks,

Switching to refcount looks a bit complicated, so for now let's use
atomics; it affects only the subpage case.
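
For context, a hypothetical refcount_t-based version of the increment
helper (with btrfs_subpage::eb_refs declared as refcount_t) could look
roughly like the sketch below; this is illustrative only, not what was
merged.  Presumably part of the complication is that eb_refs
legitimately goes from 0 to 1 and back to 0 over the lifetime of a
page, which is exactly the transition refcount_inc() and refcount_dec()
warn about:

  /* Hypothetical sketch -- the merged series keeps atomic_t for eb_refs. */
  void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
                              struct page *page)
  {
          struct btrfs_subpage *subpage;

          if (fs_info->sectorsize == PAGE_SIZE)
                  return;

          ASSERT(PagePrivate(page) && page->mapping);
          lockdep_assert_held(&page->mapping->private_lock);

          subpage = (struct btrfs_subpage *)page->private;
          /*
           * refcount_inc() WARNs when resurrecting a zero count, but the
           * first eb attached to a page starts from 0, so that transition
           * needs special casing (serialized here by private_lock).
           */
          if (!refcount_inc_not_zero(&subpage->eb_refs))
                  refcount_set(&subpage->eb_refs, 1);
  }

The decrement side would similarly need refcount_dec_and_test() instead
of a plain refcount_dec(), since refcount_dec() WARNs when the count
drops to zero.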

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 16a29f63cfd1..118874926179 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4993,25 +4993,39 @@  int extent_buffer_under_io(const struct extent_buffer *eb)
 		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 }
 
-/*
- * Release all pages attached to the extent buffer.
- */
-static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
 {
-	int i;
-	int num_pages;
-	int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+	struct btrfs_subpage *subpage;
 
-	BUG_ON(extent_buffer_under_io(eb));
+	lockdep_assert_held(&page->mapping->private_lock);
 
-	num_pages = num_extent_pages(eb);
-	for (i = 0; i < num_pages; i++) {
-		struct page *page = eb->pages[i];
+	if (PagePrivate(page)) {
+		subpage = (struct btrfs_subpage *)page->private;
+		if (atomic_read(&subpage->eb_refs))
+			return true;
+	}
+	return false;
+}
 
-		if (!page)
-			continue;
+static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+{
+	struct btrfs_fs_info *fs_info = eb->fs_info;
+	const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+
+	/*
+	 * For mapped eb, we're going to change the page private, which should
+	 * be done under the private_lock.
+	 */
+	if (mapped)
+		spin_lock(&page->mapping->private_lock);
+
+	if (!PagePrivate(page)) {
 		if (mapped)
-			spin_lock(&page->mapping->private_lock);
+			spin_unlock(&page->mapping->private_lock);
+		return;
+	}
+
+	if (fs_info->sectorsize == PAGE_SIZE) {
 		/*
 		 * We do this since we'll remove the pages after we've
 		 * removed the eb from the radix tree, so we could race
@@ -5030,9 +5044,49 @@  static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
 			 */
 			detach_page_private(page);
 		}
-
 		if (mapped)
 			spin_unlock(&page->mapping->private_lock);
+		return;
+	}
+
+	/*
+	 * For subpage, we can have dummy eb with page private.  In this case,
+	 * we can directly detach the private as such page is only attached to
+	 * one dummy eb, no sharing.
+	 */
+	if (!mapped) {
+		btrfs_detach_subpage(fs_info, page);
+		return;
+	}
+
+	btrfs_page_dec_eb_refs(fs_info, page);
+
+	/*
+	 * We can only detach the page private if there are no other ebs in the
+	 * page range.
+	 */
+	if (!page_range_has_eb(fs_info, page))
+		btrfs_detach_subpage(fs_info, page);
+
+	spin_unlock(&page->mapping->private_lock);
+}
+
+/* Release all pages attached to the extent buffer */
+static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+{
+	int i;
+	int num_pages;
+
+	ASSERT(!extent_buffer_under_io(eb));
+
+	num_pages = num_extent_pages(eb);
+	for (i = 0; i < num_pages; i++) {
+		struct page *page = eb->pages[i];
+
+		if (!page)
+			continue;
+
+		detach_extent_buffer_page(eb, page);
 
 		/* One for when we allocated the page */
 		put_page(page);
@@ -5392,6 +5446,16 @@  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 		/* Should not fail, as we have preallocated the memory */
 		ret = attach_extent_buffer_page(eb, p, prealloc);
 		ASSERT(!ret);
+		/*
+		 * To inform we have extra eb under allocation, so that
+		 * detach_extent_buffer_page() won't release the page private
+		 * when the eb hasn't yet been inserted into radix tree.
+		 *
+		 * The ref will be decreased when the eb released the page, in
+		 * detach_extent_buffer_page().
+		 * Thus needs no special handling in error path.
+		 */
+		btrfs_page_inc_eb_refs(fs_info, p);
 		spin_unlock(&mapping->private_lock);
 
 		WARN_ON(PageDirty(p));
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 61b28dfca20c..a2a21fa0ea35 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -52,6 +52,8 @@  int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
 	if (!*ret)
 		return -ENOMEM;
 	spin_lock_init(&(*ret)->lock);
+	if (type == BTRFS_SUBPAGE_METADATA)
+		atomic_set(&(*ret)->eb_refs, 0);
 	return 0;
 }
 
@@ -59,3 +61,43 @@  void btrfs_free_subpage(struct btrfs_subpage *subpage)
 {
 	kfree(subpage);
 }
+
+/*
+ * Increase the eb_refs of current subpage.
+ *
+ * This is important for eb allocation, to prevent race with last eb freeing
+ * of the same page.
+ * With the eb_refs increased before the eb inserted into radix tree,
+ * detach_extent_buffer_page() won't detach the page private while we're still
+ * allocating the extent buffer.
+ */
+void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
+			    struct page *page)
+{
+	struct btrfs_subpage *subpage;
+
+	if (fs_info->sectorsize == PAGE_SIZE)
+		return;
+
+	ASSERT(PagePrivate(page) && page->mapping);
+	lockdep_assert_held(&page->mapping->private_lock);
+
+	subpage = (struct btrfs_subpage *)page->private;
+	atomic_inc(&subpage->eb_refs);
+}
+
+void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
+			    struct page *page)
+{
+	struct btrfs_subpage *subpage;
+
+	if (fs_info->sectorsize == PAGE_SIZE)
+		return;
+
+	ASSERT(PagePrivate(page) && page->mapping);
+	lockdep_assert_held(&page->mapping->private_lock);
+
+	subpage = (struct btrfs_subpage *)page->private;
+	ASSERT(atomic_read(&subpage->eb_refs));
+	atomic_dec(&subpage->eb_refs);
+}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 7ba544bcc9c6..eef2ecae77e0 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -4,6 +4,7 @@ 
 #define BTRFS_SUBPAGE_H
 
 #include <linux/spinlock.h>
+#include <linux/refcount.h>
 
 /*
  * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap
@@ -19,7 +20,13 @@  struct btrfs_subpage {
 	/* Common members for both data and metadata pages */
 	spinlock_t lock;
 	union {
-		/* Structures only used by metadata */
+		/*
+		 * Structures only used by metadata
+		 *
+		 * @eb_refs should only be operated under private_lock, as it
+		 * manages whether the subpage can be detached.
+		 */
+		atomic_t eb_refs;
 		/* Structures only used by data */
 	};
 };
@@ -40,4 +47,8 @@  int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
 			enum btrfs_subpage_type type);
 void btrfs_free_subpage(struct btrfs_subpage *subpage);
 
+void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
+			    struct page *page);
+void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
+			    struct page *page);
 #endif