diff mbox series

[v4,1/7] btrfs: replace stripe extents

Message ID 20240705-b4-rst-updates-v4-1-f3eed3f2cfad@kernel.org (mailing list archive)
State New, archived
Headers show
Series btrfs: rst: updates for RAID stripe tree | expand

Commit Message

Johannes Thumshirn July 5, 2024, 3:13 p.m. UTC
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>

If we can't insert a stripe extent in the RAID stripe tree, because
the key that points to the specific position in the stripe tree is
already existing, we have to remove the item and then replace it by a
new item.

This can happen for example on device replace operations.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/raid-stripe-tree.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

Comments

Qu Wenruo July 5, 2024, 11:19 p.m. UTC | #1
在 2024/7/6 00:43, Johannes Thumshirn 写道:
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>
> If we can't insert a stripe extent in the RAID stripe tree, because
> the key that points to the specific position in the stripe tree is
> already existing, we have to remove the item and then replace it by a
> new item.
>
> This can happen for example on device replace operations.

In that case, can we just modify the targeted dev stripe?

Or do we have other call sites that can lead to such conflicts?

As I'm not that confident if such replace behavior would mask some real
problems.

Thanks,
Qu

>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/raid-stripe-tree.c | 36 ++++++++++++++++++++++++++++++++++++
>   1 file changed, 36 insertions(+)
>
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index e6f7a234b8f6..3b32e96c33fc 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -73,6 +73,39 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
>   	return ret;
>   }
>
> +static int replace_raid_extent_item(struct btrfs_trans_handle *trans,
> +				    struct btrfs_key *key,
> +				    struct btrfs_stripe_extent *stripe_extent,
> +				    const size_t item_size)
> +{
> +	struct btrfs_fs_info *fs_info = trans->fs_info;
> +	struct btrfs_root *stripe_root = fs_info->stripe_root;
> +	struct btrfs_path *path;
> +	int ret;
> +
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	ret = btrfs_search_slot(trans, stripe_root, key, path, -1, 1);
> +	if (ret)
> +		goto err;
> +
> +	ret = btrfs_del_item(trans, stripe_root, path);
> +	if (ret) {
> +		ret = (ret == 1) ? -ENOENT : ret;
> +		goto err;
> +	}
> +
> +	btrfs_free_path(path);
> +
> +	return btrfs_insert_item(trans, stripe_root, key, stripe_extent,
> +				 item_size);
> + err:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
>   static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
>   					struct btrfs_io_context *bioc)
>   {
> @@ -112,6 +145,9 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
>
>   	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
>   				item_size);
> +	if (ret == -EEXIST)
> +		ret = replace_raid_extent_item(trans, &stripe_key,
> +					       stripe_extent, item_size);
>   	if (ret)
>   		btrfs_abort_transaction(trans, ret);
>
>
Johannes Thumshirn July 8, 2024, 11:43 a.m. UTC | #2
On 06.07.24 01:19, Qu Wenruo wrote:
> 
> 
> 在 2024/7/6 00:43, Johannes Thumshirn 写道:
>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>
>> If we can't insert a stripe extent in the RAID stripe tree, because
>> the key that points to the specific position in the stripe tree is
>> already existing, we have to remove the item and then replace it by a
>> new item.
>>
>> This can happen for example on device replace operations.
> 
> In that case, can we just modify the targeted dev stripe?
> 
> Or do we have other call sites that can lead to such conflicts?
> 
> As I'm not that confident if such replace behavior would mask some real
> problems.

I've just tested the following patch and it looks like it's working:


diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index e6f7a234b8f6..7bfd8654c110 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -73,6 +73,53 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
         return ret;
  }

+static int update_raid_extent_item(struct btrfs_trans_handle *trans,
+				   struct btrfs_key *key,
+				   struct btrfs_io_context *bioc)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_stripe_extent *stripe_extent;
+	int num_stripes;
+	int ret;
+	int slot;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path,
+				0, 1);
+	if (ret)
+		return ret == 1 ? ret : -EINVAL;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+
+	btrfs_item_key_to_cpu(leaf, key, slot);
+	num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+	stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+	for (int i = 0; i < num_stripes; i++) {
+		u64 devid = bioc->stripes[i].dev->devid;
+		u64 physical = bioc->stripes[i].physical;
+		u64 length = bioc->stripes[i].length;
+		struct btrfs_raid_stride *raid_stride =
+			&stripe_extent->strides[i];
+
+		if (length == 0)
+			length = bioc->size;
+
+		btrfs_set_raid_stride_devid(leaf, raid_stride, devid);
+		btrfs_set_raid_stride_physical(leaf, raid_stride, physical);
+	}
+
+	btrfs_mark_buffer_dirty(trans, leaf);
+	btrfs_free_path(path);
+
+	return ret;
+}
+
  static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
					struct btrfs_io_context *bioc)
  {
@@ -112,6 +159,8 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,

	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
				item_size);
+	if (ret == -EEXIST)
+		ret = update_raid_extent_item(trans, &stripe_key, bioc);
	if (ret)
		btrfs_abort_transaction(trans, ret);
Qu Wenruo July 8, 2024, 10:14 p.m. UTC | #3
在 2024/7/8 21:13, Johannes Thumshirn 写道:
> On 06.07.24 01:19, Qu Wenruo wrote:
>>
>>
>> 在 2024/7/6 00:43, Johannes Thumshirn 写道:
>>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>>
>>> If we can't insert a stripe extent in the RAID stripe tree, because
>>> the key that points to the specific position in the stripe tree is
>>> already existing, we have to remove the item and then replace it by a
>>> new item.
>>>
>>> This can happen for example on device replace operations.
>>
>> In that case, can we just modify the targeted dev stripe?
>>
>> Or do we have other call sites that can lead to such conflicts?
>>
>> As I'm not that confident if such replace behavior would mask some real
>> problems.
>
> I've just tested the following patch and it looks like it's working:
>
>
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index e6f7a234b8f6..7bfd8654c110 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -73,6 +73,53 @@ int btrfs_delete_raid_extent(struct
> btrfs_trans_handle *trans, u64 start, u64 le
>           return ret;
>    }
>
> +static int update_raid_extent_item(struct btrfs_trans_handle *trans,
> +				   struct btrfs_key *key,
> +				   struct btrfs_io_context *bioc)
> +{
> +	struct btrfs_path *path;
> +	struct extent_buffer *leaf;
> +	struct btrfs_stripe_extent *stripe_extent;
> +	int num_stripes;
> +	int ret;
> +	int slot;
> +
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path,
> +				0, 1);
> +	if (ret)
> +		return ret == 1 ? ret : -EINVAL;

Looks good to me overall.

Considering in this case the bioc should match the existing rst entry,
can we add an extra ASSERT() to check the length of the entry against
the bioc?

Thanks,
Qu
> +
> +	leaf = path->nodes[0];
> +	slot = path->slots[0];
> +
> +	btrfs_item_key_to_cpu(leaf, key, slot);
> +	num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
> +	stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
> +
> +	for (int i = 0; i < num_stripes; i++) {
> +		u64 devid = bioc->stripes[i].dev->devid;
> +		u64 physical = bioc->stripes[i].physical;
> +		u64 length = bioc->stripes[i].length;
> +		struct btrfs_raid_stride *raid_stride =
> +			&stripe_extent->strides[i];
> +
> +		if (length == 0)
> +			length = bioc->size;
> +
> +		btrfs_set_raid_stride_devid(leaf, raid_stride, devid);
> +		btrfs_set_raid_stride_physical(leaf, raid_stride, physical);
> +	}
> +
> +	btrfs_mark_buffer_dirty(trans, leaf);
> +	btrfs_free_path(path);
> +
> +	return ret;
> +}
> +
>    static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
> 					struct btrfs_io_context *bioc)
>    {
> @@ -112,6 +159,8 @@ static int btrfs_insert_one_raid_extent(struct
> btrfs_trans_handle *trans,
>
> 	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
> 				item_size);
> +	if (ret == -EEXIST)
> +		ret = update_raid_extent_item(trans, &stripe_key, bioc);
> 	if (ret)
> 		btrfs_abort_transaction(trans, ret);
>
Qu Wenruo July 9, 2024, 5:36 a.m. UTC | #4
在 2024/7/8 21:13, Johannes Thumshirn 写道:
> On 06.07.24 01:19, Qu Wenruo wrote:
>>
>>
>> 在 2024/7/6 00:43, Johannes Thumshirn 写道:
>>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>>
>>> If we can't insert a stripe extent in the RAID stripe tree, because
>>> the key that points to the specific position in the stripe tree is
>>> already existing, we have to remove the item and then replace it by a
>>> new item.
>>>
>>> This can happen for example on device replace operations.
>>
>> In that case, can we just modify the targeted dev stripe?
>>
>> Or do we have other call sites that can lead to such conflicts?
>>
>> As I'm not that confident if such replace behavior would mask some real
>> problems.
>
> I've just tested the following patch and it looks like it's working:

After some more thinking, I'm wondering why dev-replace would even
trigger an RST entry update?

Normally for non-rst replace, we just reuse the scrub routine to read
out all the extents, then only write the content to the replace target,
thus there should be no update to anything (no chunk nor extent level
update).

I understand that for RST we cannot directly use that routine, because
an extent's bytenr is no longer directly mapped into a chunk; thus the
data on disk can be out of order and cannot be directly used for
dev-replace.


But on the other hand, the extent-based iteration only exists to avoid
wasting IO. In theory we can just copy the dev extent from one device to
the target device, and everything should work as expected.
(The bg is marked RO, thus no new write should happen there.)


Thus I'm wondering: can we just do device-extent-level copying for RST
replace?
That way, we can avoid any update to RST entries at all, mirroring the
behavior of the non-RST code.

The cost, though, is that we have to implement a dedicated RST routine
for device-replace.
In that case, dev-replace for RST would be something like:

- Scrub the source device's dev extent
- Copy the dev extent for that chunk directly to the target device
   That can only happen if the source dev extent is all correct.

Thanks,
Qu

>
>
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index e6f7a234b8f6..7bfd8654c110 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -73,6 +73,53 @@ int btrfs_delete_raid_extent(struct
> btrfs_trans_handle *trans, u64 start, u64 le
>           return ret;
>    }
>
> +static int update_raid_extent_item(struct btrfs_trans_handle *trans,
> +				   struct btrfs_key *key,
> +				   struct btrfs_io_context *bioc)
> +{
> +	struct btrfs_path *path;
> +	struct extent_buffer *leaf;
> +	struct btrfs_stripe_extent *stripe_extent;
> +	int num_stripes;
> +	int ret;
> +	int slot;
> +
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path,
> +				0, 1);
> +	if (ret)
> +		return ret == 1 ? ret : -EINVAL;
> +
> +	leaf = path->nodes[0];
> +	slot = path->slots[0];
> +
> +	btrfs_item_key_to_cpu(leaf, key, slot);
> +	num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
> +	stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
> +
> +	for (int i = 0; i < num_stripes; i++) {
> +		u64 devid = bioc->stripes[i].dev->devid;
> +		u64 physical = bioc->stripes[i].physical;
> +		u64 length = bioc->stripes[i].length;
> +		struct btrfs_raid_stride *raid_stride =
> +			&stripe_extent->strides[i];
> +
> +		if (length == 0)
> +			length = bioc->size;
> +
> +		btrfs_set_raid_stride_devid(leaf, raid_stride, devid);
> +		btrfs_set_raid_stride_physical(leaf, raid_stride, physical);
> +	}
> +
> +	btrfs_mark_buffer_dirty(trans, leaf);
> +	btrfs_free_path(path);
> +
> +	return ret;
> +}
> +
>    static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
> 					struct btrfs_io_context *bioc)
>    {
> @@ -112,6 +159,8 @@ static int btrfs_insert_one_raid_extent(struct
> btrfs_trans_handle *trans,
>
> 	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
> 				item_size);
> +	if (ret == -EEXIST)
> +		ret = update_raid_extent_item(trans, &stripe_key, bioc);
> 	if (ret)
> 		btrfs_abort_transaction(trans, ret);
>
Johannes Thumshirn July 9, 2024, 5:49 a.m. UTC | #5
On 09.07.24 00:14, Qu Wenruo wrote:
> Looks good to me overall.
> 
> Considering in this case the bioc should match the existing rst entry,
> can we add an extra ASSERT() to check the length of the entry against
> the bioc?

Sure can do.
diff mbox series

Patch

diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index e6f7a234b8f6..3b32e96c33fc 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -73,6 +73,39 @@  int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
 	return ret;
 }
 
+static int replace_raid_extent_item(struct btrfs_trans_handle *trans,
+				    struct btrfs_key *key,
+				    struct btrfs_stripe_extent *stripe_extent,
+				    const size_t item_size)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_root *stripe_root = fs_info->stripe_root;
+	struct btrfs_path *path;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, stripe_root, key, path, -1, 1);
+	if (ret)
+		goto err;
+
+	ret = btrfs_del_item(trans, stripe_root, path);
+	if (ret) {
+		ret = (ret == 1) ? -ENOENT : ret;
+		goto err;
+	}
+
+	btrfs_free_path(path);
+
+	return btrfs_insert_item(trans, stripe_root, key, stripe_extent,
+				 item_size);
+ err:
+	btrfs_free_path(path);
+	return ret;
+}
+
 static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
 					struct btrfs_io_context *bioc)
 {
@@ -112,6 +145,9 @@  static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
 				item_size);
+	if (ret == -EEXIST)
+		ret = replace_raid_extent_item(trans, &stripe_key,
+					       stripe_extent, item_size);
 	if (ret)
 		btrfs_abort_transaction(trans, ret);