diff mbox series

[2/2] btrfs: replace stripe extents

Message ID 20240709-b4-rst-updates-v1-2-200800dfe0fd@kernel.org (mailing list archive)
State New, archived
Headers show
Series btrfs: more RAID stripe tree updates | expand

Commit Message

Johannes Thumshirn July 9, 2024, 6:32 a.m. UTC
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>

Update stripe extents in case a write to an already existing address
incoming.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/raid-stripe-tree.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

Comments

Qu Wenruo July 9, 2024, 7:18 a.m. UTC | #1
在 2024/7/9 16:02, Johannes Thumshirn 写道:
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>
> Update stripe extents in case a write to an already existing address
> incoming.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

Looks good to me.

Reviewed-by: Qu Wenruo <wqu@suse.com>

But still as I mentioned in the original thread, I'm wondering why
dev-replace of RST needs to update RST entry.

I'd prefer to do a dev-extent level copy so that no RST/chunk needs to
be updated, just like what we did for non-RST cases.

But so far the change should be good enough for us to continue the testing.

Thanks,
Qu

> ---
>   fs/btrfs/raid-stripe-tree.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 51 insertions(+)
>
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index e6f7a234b8f6..fd56535b2289 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -73,6 +73,55 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
>   	return ret;
>   }
>
> +static int update_raid_extent_item(struct btrfs_trans_handle *trans,
> +				   struct btrfs_key *key,
> +				   struct btrfs_io_context *bioc)
> +{
> +	struct btrfs_path *path;
> +	struct extent_buffer *leaf;
> +	struct btrfs_stripe_extent *stripe_extent;
> +	int num_stripes;
> +	int ret;
> +	int slot;
> +
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path,
> +				0, 1);
> +	if (ret)
> +		return ret == 1 ? ret : -EINVAL;
> +
> +	leaf = path->nodes[0];
> +	slot = path->slots[0];
> +
> +	btrfs_item_key_to_cpu(leaf, key, slot);
> +	num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
> +	stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
> +
> +	ASSERT(key->offset == bioc->size);
> +
> +	for (int i = 0; i < num_stripes; i++) {
> +		u64 devid = bioc->stripes[i].dev->devid;
> +		u64 physical = bioc->stripes[i].physical;
> +		u64 length = bioc->stripes[i].length;
> +		struct btrfs_raid_stride *raid_stride =
> +			&stripe_extent->strides[i];
> +
> +		if (length == 0)
> +			length = bioc->size;
> +
> +		btrfs_set_raid_stride_devid(leaf, raid_stride, devid);
> +		btrfs_set_raid_stride_physical(leaf, raid_stride, physical);
> +	}
> +
> +	btrfs_mark_buffer_dirty(trans, leaf);
> +	btrfs_free_path(path);
> +
> +	return ret;
> +}
> +
>   static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
>   					struct btrfs_io_context *bioc)
>   {
> @@ -112,6 +161,8 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
>
>   	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
>   				item_size);
> +	if (ret == -EEXIST)
> +		ret = update_raid_extent_item(trans, &stripe_key, bioc);
>   	if (ret)
>   		btrfs_abort_transaction(trans, ret);
>
>
Johannes Thumshirn July 9, 2024, 7:30 a.m. UTC | #2
On 09.07.24 09:18, Qu Wenruo wrote:
> 
> 
> 在 2024/7/9 16:02, Johannes Thumshirn 写道:
>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>
>> Update stripe extents in case a write to an already existing address
>> incoming.
>>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> 
> Looks good to me.
> 
> Reviewed-by: Qu Wenruo <wqu@suse.com>
> 
> But still as I mentioned in the original thread, I'm wondering why
> dev-replace of RST needs to update RST entry.
> 
> I'd prefer to do a dev-extent level copy so that no RST/chunk needs to
> be updated, just like what we did for non-RST cases.
> 
> But so far the change should be good enough for us to continue the testing.

I /think/ I have a fix for the ASSERT() as well. It survived btrfs/060 
once already (which it hasn't before) and it's trivial and I feel stupid 
for it:

diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index fd56535b2289..6b1c6004f94c 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -57,6 +57,9 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle 
*trans, u64 start, u64 le
                 /* That stripe ends before we start, we're done. */
                 if (found_end <= start)
                         break;
+               /* That stripe starts after we end, we're done as well */
+               if (found_start >= end)
+                       break;

                 trace_btrfs_raid_extent_delete(fs_info, start, end,
                                                found_start, found_end);
Qu Wenruo July 9, 2024, 7:46 a.m. UTC | #3
在 2024/7/9 17:00, Johannes Thumshirn 写道:
> On 09.07.24 09:18, Qu Wenruo wrote:
>>
>>
>> 在 2024/7/9 16:02, Johannes Thumshirn 写道:
>>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>>
>>> Update stripe extents in case a write to an already existing address
>>> incoming.
>>>
>>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>
>> Looks good to me.
>>
>> Reviewed-by: Qu Wenruo <wqu@suse.com>
>>
>> But still as I mentioned in the original thread, I'm wondering why
>> dev-replace of RST needs to update RST entry.
>>
>> I'd prefer to do a dev-extent level copy so that no RST/chunk needs to
>> be updated, just like what we did for non-RST cases.
>>
>> But so far the change should be good enough for us to continue the testing.
> 
> I /think/ I have a fix for the ASSERT() as well. It survived btrfs/060
> once already (which it hasn't before) and it's trivial and I feel stupid
> for it:

Wow, it's indeed a little embarrassing, but I'm still a little confused.

> 
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index fd56535b2289..6b1c6004f94c 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -57,6 +57,9 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle
> *trans, u64 start, u64 le
>                   /* That stripe ends before we start, we're done. */

Didn't all the btrfs_delete_raid_extent() callers expects to delete 
exact the range? Thus I though we should always hit 0 from 
btrfs_search_slot().

>                   if (found_end <= start)
>                           break;
> +               /* That stripe starts after we end, we're done as well */
> +               if (found_start >= end)
> +                       break;

Another thing is, just to be safer, you may want to do the RST entry 
search using key.offset = 0 or key.offset = -1, instead of an exact search.

The key.offset == 0 search example can be found in scrub_enumerate_chunk().
And the key.offset == -1 search example can be found in 
btrfs_free_dev_extent().

And do extra length check to ensure we always hit an exact match.

Thanks,
Qu

> 
>                   trace_btrfs_raid_extent_delete(fs_info, start, end,
>                                                  found_start, found_end);
>
Johannes Thumshirn July 9, 2024, 8:44 a.m. UTC | #4
On 09.07.24 09:46, Qu Wenruo wrote:
> 
> 
> 在 2024/7/9 17:00, Johannes Thumshirn 写道:
>> On 09.07.24 09:18, Qu Wenruo wrote:
>>>
>>>
>>> 在 2024/7/9 16:02, Johannes Thumshirn 写道:
>>>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>>>
>>>> Update stripe extents in case a write to an already existing address
>>>> incoming.
>>>>
>>>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>>
>>> Looks good to me.
>>>
>>> Reviewed-by: Qu Wenruo <wqu@suse.com>
>>>
>>> But still as I mentioned in the original thread, I'm wondering why
>>> dev-replace of RST needs to update RST entry.
>>>
>>> I'd prefer to do a dev-extent level copy so that no RST/chunk needs to
>>> be updated, just like what we did for non-RST cases.
>>>
>>> But so far the change should be good enough for us to continue the testing.
>>
>> I /think/ I have a fix for the ASSERT() as well. It survived btrfs/060
>> once already (which it hasn't before) and it's trivial and I feel stupid
>> for it:
> 
> Wow, it's indeed a little embarrassing, but I'm still a little confused.
> 
>>
>> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
>> index fd56535b2289..6b1c6004f94c 100644
>> --- a/fs/btrfs/raid-stripe-tree.c
>> +++ b/fs/btrfs/raid-stripe-tree.c
>> @@ -57,6 +57,9 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle
>> *trans, u64 start, u64 le
>>                    /* That stripe ends before we start, we're done. */
> 
> Didn't all the btrfs_delete_raid_extent() callers expects to delete
> exact the range? Thus I though we should always hit 0 from
> btrfs_search_slot().
> 
>>                    if (found_end <= start)
>>                            break;
>> +               /* That stripe starts after we end, we're done as well */
>> +               if (found_start >= end)
>> +                       break;
> 
> Another thing is, just to be safer, you may want to do the RST entry
> search using key.offset = 0 or key.offset = -1, instead of an exact search.
> 
> The key.offset == 0 search example can be found in scrub_enumerate_chunk().
> And the key.offset == -1 search example can be found in
> btrfs_free_dev_extent().
> 
> And do extra length check to ensure we always hit an exact match.

Ah I didn't know about that one, thanks :).

Currently the above is running through CI and once it completes I'll 
give it a 2nd try with the key.offset = 0 or -1 variant.
diff mbox series

Patch

diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index e6f7a234b8f6..fd56535b2289 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -73,6 +73,55 @@  int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
 	return ret;
 }
 
+static int update_raid_extent_item(struct btrfs_trans_handle *trans,
+				   struct btrfs_key *key,
+				   struct btrfs_io_context *bioc)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_stripe_extent *stripe_extent;
+	int num_stripes;
+	int ret;
+	int slot;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path,
+				0, 1);
+	if (ret)
+		return ret == 1 ? ret : -EINVAL;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+
+	btrfs_item_key_to_cpu(leaf, key, slot);
+	num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+	stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+	ASSERT(key->offset == bioc->size);
+
+	for (int i = 0; i < num_stripes; i++) {
+		u64 devid = bioc->stripes[i].dev->devid;
+		u64 physical = bioc->stripes[i].physical;
+		u64 length = bioc->stripes[i].length;
+		struct btrfs_raid_stride *raid_stride =
+			&stripe_extent->strides[i];
+
+		if (length == 0)
+			length = bioc->size;
+
+		btrfs_set_raid_stride_devid(leaf, raid_stride, devid);
+		btrfs_set_raid_stride_physical(leaf, raid_stride, physical);
+	}
+
+	btrfs_mark_buffer_dirty(trans, leaf);
+	btrfs_free_path(path);
+
+	return ret;
+}
+
 static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
 					struct btrfs_io_context *bioc)
 {
@@ -112,6 +161,8 @@  static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
 				item_size);
+	if (ret == -EEXIST)
+		ret = update_raid_extent_item(trans, &stripe_key, bioc);
 	if (ret)
 		btrfs_abort_transaction(trans, ret);