
[v4,12/18] btrfs: implement try_release_extent_buffer() for subpage metadata support

Message ID 20210116071533.105780-13-wqu@suse.com (mailing list archive)
State New, archived
Series btrfs: add read-only support for subpage sector size

Commit Message

Qu Wenruo Jan. 16, 2021, 7:15 a.m. UTC
Unlike the original try_release_extent_buffer(),
try_release_subpage_extent_buffer() will iterate through all the ebs in
the page, and try to release each eb.

And only if the page has no private attached, which implies we have
released all ebs of the page, can we release the full page.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/extent_io.c | 106 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 104 insertions(+), 2 deletions(-)
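
For context, try_release_extent_buffer() is entered from the btree inode's
->releasepage callback when the VM wants to drop a clean metadata page.  A
rough sketch of that caller (simplified from fs/btrfs/disk-io.c for
illustration only, not part of this patch):

	static int btree_releasepage(struct page *page, gfp_t gfp_flags)
	{
		/* Pages that are dirty or under writeback cannot be released */
		if (PageWriteback(page) || PageDirty(page))
			return 0;

		return try_release_extent_buffer(page);
	}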

Comments

Josef Bacik Jan. 20, 2021, 3:05 p.m. UTC | #1
On 1/16/21 2:15 AM, Qu Wenruo wrote:
> Unlike the original try_release_extent_buffer(),
> try_release_subpage_extent_buffer() will iterate through all the ebs in
> the page, and try to release each eb.
> 
> And only if the page and no private attached, which implies we have
> released all ebs of the page, then we can release the full page.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>   fs/btrfs/extent_io.c | 106 ++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 104 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 74a37eec921f..9414219fa28b 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -6335,13 +6335,115 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
>   	}
>   }
>   
> +static struct extent_buffer *get_next_extent_buffer(
> +		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
> +{
> +	struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
> +	struct extent_buffer *found = NULL;
> +	u64 page_start = page_offset(page);
> +	int ret;
> +	int i;
> +
> +	ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
> +	ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
> +	lockdep_assert_held(&fs_info->buffer_lock);
> +
> +	ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
> +			bytenr >> fs_info->sectorsize_bits,
> +			PAGE_SIZE / fs_info->nodesize);
> +	for (i = 0; i < ret; i++) {
> +		/* Already beyond page end */
> +		if (gang[i]->start >= page_start + PAGE_SIZE)
> +			break;
> +		/* Found one */
> +		if (gang[i]->start >= bytenr) {
> +			found = gang[i];
> +			break;
> +		}
> +	}
> +	return found;
> +}
> +
> +static int try_release_subpage_extent_buffer(struct page *page)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
> +	u64 cur = page_offset(page);
> +	const u64 end = page_offset(page) + PAGE_SIZE;
> +	int ret;
> +
> +	while (cur < end) {
> +		struct extent_buffer *eb = NULL;
> +
> +		/*
> +		 * Unlike try_release_extent_buffer() which uses page->private
> +		 * to grab buffer, for subpage case we rely on radix tree, thus
> +		 * we need to ensure radix tree consistency.
> +		 *
> +		 * We also want an atomic snapshot of the radix tree, thus go
> +		 * spinlock other than RCU.
> +		 */
> +		spin_lock(&fs_info->buffer_lock);
> +		eb = get_next_extent_buffer(fs_info, page, cur);
> +		if (!eb) {
> +			/* No more eb in the page range after or at @cur */
> +			spin_unlock(&fs_info->buffer_lock);
> +			break;
> +		}
> +		cur = eb->start + eb->len;
> +
> +		/*
> +		 * The same as try_release_extent_buffer(), to ensure the eb
> +		 * won't disappear out from under us.
> +		 */
> +		spin_lock(&eb->refs_lock);
> +		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
> +			spin_unlock(&eb->refs_lock);
> +			spin_unlock(&fs_info->buffer_lock);

Why continue at this point?  We know we can't drop this thing, break here.
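
I.e. something like this (untested sketch, just to illustrate the idea): once
one eb in the page turns out to be pinned, the page private can't be cleared
anyway, so there's no point scanning the rest of the range:

		spin_lock(&eb->refs_lock);
		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
			spin_unlock(&eb->refs_lock);
			spin_unlock(&fs_info->buffer_lock);
			/* This eb still pins the page, stop scanning */
			break;
		}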

<snip>

> +}
> +
>   int try_release_extent_buffer(struct page *page)
>   {
>   	struct extent_buffer *eb;
>   
> +	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
> +		return try_release_subpage_extent_buffer(page);

You're using sectorsize again here.  I realize the problem is sectorsize != 
PAGE_SIZE, but sectorsize != nodesize all the time, so please change all of the 
patches to check the actual relevant size for the data/metadata type.  Thanks,

Josef
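
For the metadata path touched here, the check Josef is asking for would
presumably be keyed on nodesize rather than sectorsize, roughly like the
following (an editorial sketch only, not what the series does; see the
discussion below for why sectorsize is kept as the constraint for now):

	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
		return try_release_subpage_extent_buffer(page);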
Qu Wenruo Jan. 21, 2021, 12:51 a.m. UTC | #2
On 2021/1/20 11:05 PM, Josef Bacik wrote:
> On 1/16/21 2:15 AM, Qu Wenruo wrote:
>> Unlike the original try_release_extent_buffer(),
>> try_release_subpage_extent_buffer() will iterate through all the ebs in
>> the page, and try to release each eb.
>>
>> And only if the page and no private attached, which implies we have
>> released all ebs of the page, then we can release the full page.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/extent_io.c | 106 ++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 104 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index 74a37eec921f..9414219fa28b 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -6335,13 +6335,115 @@ void memmove_extent_buffer(const struct
>> extent_buffer *dst,
>>       }
>>   }
>> +static struct extent_buffer *get_next_extent_buffer(
>> +        struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
>> +{
>> +    struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
>> +    struct extent_buffer *found = NULL;
>> +    u64 page_start = page_offset(page);
>> +    int ret;
>> +    int i;
>> +
>> +    ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
>> +    ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
>> +    lockdep_assert_held(&fs_info->buffer_lock);
>> +
>> +    ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
>> +            bytenr >> fs_info->sectorsize_bits,
>> +            PAGE_SIZE / fs_info->nodesize);
>> +    for (i = 0; i < ret; i++) {
>> +        /* Already beyond page end */
>> +        if (gang[i]->start >= page_start + PAGE_SIZE)
>> +            break;
>> +        /* Found one */
>> +        if (gang[i]->start >= bytenr) {
>> +            found = gang[i];
>> +            break;
>> +        }
>> +    }
>> +    return found;
>> +}
>> +
>> +static int try_release_subpage_extent_buffer(struct page *page)
>> +{
>> +    struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
>> +    u64 cur = page_offset(page);
>> +    const u64 end = page_offset(page) + PAGE_SIZE;
>> +    int ret;
>> +
>> +    while (cur < end) {
>> +        struct extent_buffer *eb = NULL;
>> +
>> +        /*
>> +         * Unlike try_release_extent_buffer() which uses page->private
>> +         * to grab buffer, for subpage case we rely on radix tree, thus
>> +         * we need to ensure radix tree consistency.
>> +         *
>> +         * We also want an atomic snapshot of the radix tree, thus go
>> +         * spinlock other than RCU.
>> +         */
>> +        spin_lock(&fs_info->buffer_lock);
>> +        eb = get_next_extent_buffer(fs_info, page, cur);
>> +        if (!eb) {
>> +            /* No more eb in the page range after or at @cur */
>> +            spin_unlock(&fs_info->buffer_lock);
>> +            break;
>> +        }
>> +        cur = eb->start + eb->len;
>> +
>> +        /*
>> +         * The same as try_release_extent_buffer(), to ensure the eb
>> +         * won't disappear out from under us.
>> +         */
>> +        spin_lock(&eb->refs_lock);
>> +        if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
>> +            spin_unlock(&eb->refs_lock);
>> +            spin_unlock(&fs_info->buffer_lock);
>
> Why continue at this point?  We know we can't drop this thing, break here.
>
> <snip>
>
>> +}
>> +
>>   int try_release_extent_buffer(struct page *page)
>>   {
>>       struct extent_buffer *eb;
>> +    if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
>> +        return try_release_subpage_extent_buffer(page);
>
> You're using sectorsize again here.  I realize the problem is sectorsize
> != PAGE_SIZE, but sectorsize != nodesize all the time, so please change
> all of the patches to check the actual relevant size for the
> data/metadata type.  Thanks,

Again, nodesize >= sectorsize is the requirement for current mkfs,
as sectorsize is the minimal unit for both data and metadata.
(We don't have separate data and metadata unit sizes, they share the
sector size for now.)

Thanks,
Qu

>
> Josef
David Sterba Jan. 23, 2021, 8:36 p.m. UTC | #3
On Wed, Jan 20, 2021 at 10:05:56AM -0500, Josef Bacik wrote:
> On 1/16/21 2:15 AM, Qu Wenruo wrote:
> >   int try_release_extent_buffer(struct page *page)
> >   {
> >   	struct extent_buffer *eb;
> >   
> > +	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
> > +		return try_release_subpage_extent_buffer(page);
> 
> You're using sectorsize again here.  I realize the problem is sectorsize != 
> PAGE_SIZE, but sectorsize != nodesize all the time, so please change all of the 
> patches to check the actual relevant size for the data/metadata type.  Thanks,

We had a long discussion with Qu about that on slack some time ago.
Right now we have sectorsize defining the data block size and also the
metadata unit size and check that as a constraint.

This is not perfect and does not cover all possible page/data/metadata
block size combinations, and it looks odd in the code, like in
scrub_checksum_tree_block (see the comment).

Adding the subpage support is quite intrusive and we can't cover all
size combinations at the same time, so we agreed on a first iteration
where sectorsize is still used as the nodesize constraint. This allows
testing the new code and the whole subpage infrastructure on real hw
like arm64 or ppc64.

That we'll need to decouple sectorsize from the metadata won't be
forgotten; I've agreed to that conditionally, and given how many
patches are floating around, it would only become harder to move
forward with them otherwise.
Josef Bacik Jan. 25, 2021, 8:02 p.m. UTC | #4
On 1/23/21 3:36 PM, David Sterba wrote:
> On Wed, Jan 20, 2021 at 10:05:56AM -0500, Josef Bacik wrote:
>> On 1/16/21 2:15 AM, Qu Wenruo wrote:
>>>    int try_release_extent_buffer(struct page *page)
>>>    {
>>>    	struct extent_buffer *eb;
>>>    
>>> +	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
>>> +		return try_release_subpage_extent_buffer(page);
>>
>> You're using sectorsize again here.  I realize the problem is sectorsize !=
>> PAGE_SIZE, but sectorsize != nodesize all the time, so please change all of the
>> patches to check the actual relevant size for the data/metadata type.  Thanks,
> 
> We had a long discussion with Qu about that on slack some time ago.
> Right now we have sectorsize defining the data block size and also the
> metadata unit size and check that as a constraint.
> 
> This is not perfect and does not cover all possible page/data/metadata
> block size combinations and in the code looks odd, like in
> scrub_checksum_tree_block, see the comment.
> 
> Adding the subpage support is quite intrusive and we can't cover all
> size combinations at the same time so we agreed on first iteration where
> sectorsize is still used as the nodesize constraint. This allows to test
> the new code and the whole subpage infrastructure on real hw like arm64
> or ppc64.
> 
> That we'll need to decouple sectorsize from the metadata won't be
> forgotten, I've agreed with that conditionally and given how many
> patches are floating around it'll become even harder to move forward
> with the patches.
> 

Alright that's fine, I'll ignore the weird mismatch'ed checks for now.  Thanks,

Josef

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 74a37eec921f..9414219fa28b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -6335,13 +6335,115 @@  void memmove_extent_buffer(const struct extent_buffer *dst,
 	}
 }
 
+static struct extent_buffer *get_next_extent_buffer(
+		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+{
+	struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
+	struct extent_buffer *found = NULL;
+	u64 page_start = page_offset(page);
+	int ret;
+	int i;
+
+	ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
+	ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
+	lockdep_assert_held(&fs_info->buffer_lock);
+
+	ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
+			bytenr >> fs_info->sectorsize_bits,
+			PAGE_SIZE / fs_info->nodesize);
+	for (i = 0; i < ret; i++) {
+		/* Already beyond page end */
+		if (gang[i]->start >= page_start + PAGE_SIZE)
+			break;
+		/* Found one */
+		if (gang[i]->start >= bytenr) {
+			found = gang[i];
+			break;
+		}
+	}
+	return found;
+}
+
+static int try_release_subpage_extent_buffer(struct page *page)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+	u64 cur = page_offset(page);
+	const u64 end = page_offset(page) + PAGE_SIZE;
+	int ret;
+
+	while (cur < end) {
+		struct extent_buffer *eb = NULL;
+
+		/*
+		 * Unlike try_release_extent_buffer() which uses page->private
+		 * to grab buffer, for subpage case we rely on radix tree, thus
+		 * we need to ensure radix tree consistency.
+		 *
+		 * We also want an atomic snapshot of the radix tree, thus we
+		 * go with a spinlock rather than RCU.
+		 */
+		spin_lock(&fs_info->buffer_lock);
+		eb = get_next_extent_buffer(fs_info, page, cur);
+		if (!eb) {
+			/* No more eb in the page range after or at @cur */
+			spin_unlock(&fs_info->buffer_lock);
+			break;
+		}
+		cur = eb->start + eb->len;
+
+		/*
+		 * The same as try_release_extent_buffer(), to ensure the eb
+		 * won't disappear out from under us.
+		 */
+		spin_lock(&eb->refs_lock);
+		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+			spin_unlock(&eb->refs_lock);
+			spin_unlock(&fs_info->buffer_lock);
+			continue;
+		}
+		spin_unlock(&fs_info->buffer_lock);
+
+		/*
+		 * If tree ref isn't set then we know the ref on this eb is a
+		 * real ref, so just skip it, this eb will likely be freed soon
+		 * anyway.
+		 */
+		if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+			spin_unlock(&eb->refs_lock);
+			continue;
+		}
+
+		/*
+		 * Here we don't care about the return value, we will always
+		 * check the page private at the end.
+		 * And release_extent_buffer() will release the refs_lock.
+		 */
+		release_extent_buffer(eb);
+	}
+	/*
+	 * Finally, check if we have cleared the page private; if we have
+	 * released all ebs in the page, the page private should be cleared now.
+	 */
+	spin_lock(&page->mapping->private_lock);
+	if (!PagePrivate(page))
+		ret = 1;
+	else
+		ret = 0;
+	spin_unlock(&page->mapping->private_lock);
+	return ret;
+
+}
+
 int try_release_extent_buffer(struct page *page)
 {
 	struct extent_buffer *eb;
 
+	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+		return try_release_subpage_extent_buffer(page);
+
 	/*
-	 * We need to make sure nobody is attaching this page to an eb right
-	 * now.
+	 * We need to make sure nobody is changing page->private, as we rely on
+	 * page->private as the pointer to the extent buffer.
 	 */
 	spin_lock(&page->mapping->private_lock);
 	if (!PagePrivate(page)) {