[v4,3/7] btrfs: split RAID stripes on deletion

Message ID 20240705-b4-rst-updates-v4-3-f3eed3f2cfad@kernel.org (mailing list archive)
State New, archived
Series btrfs: rst: updates for RAID stripe tree

Commit Message

Johannes Thumshirn July 5, 2024, 3:13 p.m. UTC
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>

The current RAID stripe code assumes that we always remove a whole
stripe entry.

But if we only remove part of a RAID stripe, we hit the ASSERT()
checking for this condition.

Instead of assuming the complete deletion of a RAID stripe, split the
stripe if we need to.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/ctree.c            |  1 +
 fs/btrfs/raid-stripe-tree.c | 98 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 75 insertions(+), 24 deletions(-)
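
To illustrate the intended split, here is a minimal standalone sketch of
the range arithmetic (illustrative names and values only, not the kernel
implementation): a RAID stride maps a logical range linearly onto a
physical range, so when a middle part of that range is deleted, the
surviving tail piece must advance its physical start by the same delta
as its logical start, while the front piece keeps its physical start.

#include <stdio.h>

/* Illustrative stand-in for one RAID stride; not a kernel structure. */
struct stride {
	unsigned long long logical;	/* logical start of the mapped range */
	unsigned long long physical;	/* physical start on the device */
	unsigned long long length;	/* length of the mapped range */
};

int main(void)
{
	/* Stripe extent mapping [0, 128K) onto [1M, 1M + 128K). */
	struct stride s = { .logical = 0, .physical = 1048576, .length = 131072 };
	/* Partial delete of the middle range [64K, 96K). */
	unsigned long long del_start = 65536;
	unsigned long long del_end = 98304;

	if (s.logical < del_start) {
		/* Front piece survives with its physical start unchanged. */
		printf("front: logical=%llu physical=%llu length=%llu\n",
		       s.logical, s.physical, del_start - s.logical);
	}
	if (s.logical + s.length > del_end) {
		/* Tail piece survives; physical advances by the logical delta. */
		unsigned long long diff = del_end - s.logical;
		printf("tail: logical=%llu physical=%llu length=%llu\n",
		       del_end, s.physical + diff,
		       s.logical + s.length - del_end);
	}
	return 0;
}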

Comments

Qu Wenruo July 5, 2024, 11:26 p.m. UTC | #1
On 2024/7/6 00:43, Johannes Thumshirn wrote:
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>
> The current RAID stripe code assumes that we always remove a
> whole stripe entry.
>
> But if we only remove part of a RAID stripe, we hit the ASSERT()
> checking for this condition.
>
> Instead of assuming the complete deletion of a RAID stripe, split the
> stripe if we need to.

Sorry to be so critical, but if I understand correctly,
btrfs_insert_one_raid_extent() does not do any merging of stripe extents.

Thus one stripe extent always corresponds to part of a data extent.

In that case, removal of a data extent should always remove all of its
stripe extents.

Furthermore, due to the COW nature of zoned/RST devices, the space of a
deleted data extent should not be re-allocated until the transaction
commits.

Thus I'm wondering if this split is masking some bigger problems.

Thanks,
Qu


>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/ctree.c            |  1 +
>   fs/btrfs/raid-stripe-tree.c | 98 ++++++++++++++++++++++++++++++++++-----------
>   2 files changed, 75 insertions(+), 24 deletions(-)
>
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 451203055bbf..82278e54969e 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -3863,6 +3863,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
>   	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
>
>   	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
> +	       key.type != BTRFS_RAID_STRIPE_KEY &&
>   	       key.type != BTRFS_EXTENT_CSUM_KEY);
>
>   	if (btrfs_leaf_free_space(leaf) >= ins_len)
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index 84746c8886be..d2a6e409b3e8 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -33,42 +33,92 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
>   	if (!path)
>   		return -ENOMEM;
>
> -	while (1) {
> -		key.objectid = start;
> -		key.type = BTRFS_RAID_STRIPE_KEY;
> -		key.offset = length;
> +again:
> +	key.objectid = start;
> +	key.type = BTRFS_RAID_STRIPE_KEY;
> +	key.offset = length;
>
> -		ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
> -		if (ret < 0)
> -			break;
> -		if (ret > 0) {
> -			ret = 0;
> -			if (path->slots[0] == 0)
> -				break;
> -			path->slots[0]--;
> -		}
> +	ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
> +	if (ret < 0)
> +		goto out;
> +	if (ret > 0) {
> +		ret = 0;
> +		if (path->slots[0] == 0)
> +			goto out;
> +		path->slots[0]--;
> +	}
> +
> +	leaf = path->nodes[0];
> +	slot = path->slots[0];
> +	btrfs_item_key_to_cpu(leaf, &key, slot);
> +	found_start = key.objectid;
> +	found_end = found_start + key.offset;
> +
> +	/* That stripe ends before we start, we're done. */
> +	if (found_end <= start)
> +		goto out;
> +
> +	trace_btrfs_raid_extent_delete(fs_info, start, end,
> +				       found_start, found_end);
> +
> +	if (found_start < start) {
> +		u64 diff = start - found_start;
> +		struct btrfs_key new_key;
> +		int num_stripes;
> +		struct btrfs_stripe_extent *stripe_extent;
> +
> +		new_key.objectid = start;
> +		new_key.type = BTRFS_RAID_STRIPE_KEY;
> +		new_key.offset = length - diff;
> +
> +		ret = btrfs_duplicate_item(trans, stripe_root, path,
> +					   &new_key);
> +		if (ret)
> +			goto out;
>
>   		leaf = path->nodes[0];
>   		slot = path->slots[0];
> -		btrfs_item_key_to_cpu(leaf, &key, slot);
> -		found_start = key.objectid;
> -		found_end = found_start + key.offset;
>
> -		/* That stripe ends before we start, we're done. */
> -		if (found_end <= start)
> -			break;
> +		num_stripes =
> +			btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
> +		stripe_extent =
> +			btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
> +
> +		for (int i = 0; i < num_stripes; i++) {
> +			struct btrfs_raid_stride *raid_stride =
> +				&stripe_extent->strides[i];
> +			u64 physical =
> +				btrfs_raid_stride_physical(leaf, raid_stride);
> +
> +			btrfs_set_raid_stride_physical(leaf, raid_stride,
> +							     physical + diff);
> +		}
> +
> +		btrfs_mark_buffer_dirty(trans, leaf);
> +		btrfs_release_path(path);
> +		goto again;
> +	}
>
> -		trace_btrfs_raid_extent_delete(fs_info, start, end,
> -					       found_start, found_end);
> +	if (found_end > end) {
> +		u64 diff = found_end - end;
> +		struct btrfs_key new_key;
>
> -		ASSERT(found_start >= start && found_end <= end);
> -		ret = btrfs_del_item(trans, stripe_root, path);
> +		new_key.objectid = found_start;
> +		new_key.type = BTRFS_RAID_STRIPE_KEY;
> +		new_key.offset = length - diff;
> +
> +		ret = btrfs_duplicate_item(trans, stripe_root, path,
> +					   &new_key);
>   		if (ret)
> -			break;
> +			goto out;
>
> +		btrfs_mark_buffer_dirty(trans, leaf);
>   		btrfs_release_path(path);
>   	}
>
> +	ret = btrfs_del_item(trans, stripe_root, path);
> +
> + out:
>   	btrfs_free_path(path);
>   	return ret;
>   }
>
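
For context, the partial-overlap cases handled above are exactly what the
dropped ASSERT() ruled out. As a standalone sketch (the helper name
stripe_fully_contained is made up for illustration; this is not kernel
code): with one stripe extent per data extent and no merging on insert,
every item visited while deleting a data extent's range [start, end)
must be fully contained in it.

/* Illustrative only: the invariant the dropped ASSERT() expressed for
 * each stripe extent found while deleting the range [start, end). */
static inline int stripe_fully_contained(unsigned long long found_start,
					 unsigned long long found_end,
					 unsigned long long start,
					 unsigned long long end)
{
	return found_start >= start && found_end <= end;
}

If this check ever fails then, as Qu argues above, the cause is more
likely a stale or mis-keyed stripe extent than a legitimate partial
delete.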
Johannes Thumshirn July 8, 2024, 4:56 a.m. UTC | #2
On 06.07.24 01:26, Qu Wenruo wrote:
> 
> 
> On 2024/7/6 00:43, Johannes Thumshirn wrote:
>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>
>> The current RAID stripe code assumes that we always remove a
>> whole stripe entry.
>>
>> But if we only remove part of a RAID stripe, we hit the ASSERT()
>> checking for this condition.
>>
>> Instead of assuming the complete deletion of a RAID stripe, split the
>> stripe if we need to.
> 
> Sorry to be so critical, but if I understand correctly,
> btrfs_insert_one_raid_extent() does not do any merging of stripe extents.

No problem at all. I want to solve bugs, not increase my patch count ;).

> 
> Thus one stripe extent always corresponds to part of a data extent.
>
> In that case, removal of a data extent should always remove all of its
> stripe extents.
>
> Furthermore, due to the COW nature of zoned/RST devices, the space of a
> deleted data extent should not be re-allocated until the transaction
> commits.
>
> Thus I'm wondering if this split is masking some bigger problems.

Hmm, now that you mention it: the reason I wrote this patch is that I
did hit the following ASSERT() in my testing:

>> -		ASSERT(found_start >= start && found_end <= end);

This indicates a partial delete of a stripe extent. But I agree: as
stripe extents are tied to extent items, this shouldn't really happen.

So maybe most of this series (apart from the deadlock fix) masks problems?

I'm back to the drawing board :(.
Qu Wenruo July 8, 2024, 5:20 a.m. UTC | #3
On 2024/7/8 14:26, Johannes Thumshirn wrote:
> On 06.07.24 01:26, Qu Wenruo wrote:
>>
>>
>> On 2024/7/6 00:43, Johannes Thumshirn wrote:
>>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>>
>>> The current RAID stripe code assumes that we always remove a
>>> whole stripe entry.
>>>
>>> But if we only remove part of a RAID stripe, we hit the ASSERT()
>>> checking for this condition.
>>>
>>> Instead of assuming the complete deletion of a RAID stripe, split the
>>> stripe if we need to.
>>
>> Sorry to be so critical, but if I understand correctly,
>> btrfs_insert_one_raid_extent() does not do any merging of stripe extents.
>
> No problem at all. I want to solve bugs, not increase my patch count ;).
>
>>
>> Thus one stripe extent always corresponds to part of a data extent.
>>
>> In that case, removal of a data extent should always remove all of its
>> stripe extents.
>>
>> Furthermore, due to the COW nature of zoned/RST devices, the space of a
>> deleted data extent should not be re-allocated until the transaction
>> commits.
>>
>> Thus I'm wondering if this split is masking some bigger problems.
>
> Hmm, now that you mention it: the reason I wrote this patch is that I
> did hit the following ASSERT() in my testing:
>
>>> -		ASSERT(found_start >= start && found_end <= end);
>
> This indicates a partial delete of a stripe extent. But I agree: as
> stripe extents are tied to extent items, this shouldn't really happen.
>
> So maybe most of this series (apart from the deadlock fix) masks problems?
>
> I'm back to the drawing board :(.

Can the ASSERT() be reproduced without a zoned device? (I'm really not a
fan of the existing tcmu-emulated solution, and meanwhile libvirt still
doesn't support ZNS devices.)

If it can be reproduced with just the RST feature, I may provide some
help digging into the ASSERT().

Thanks,
Qu
Johannes Thumshirn July 8, 2024, 5:25 a.m. UTC | #4
On 08.07.24 07:20, Qu Wenruo wrote:
> 
> Can the ASSERT() be reproduced without a zoned device? (I'm really not a
> fan of the existing tcmu-emulated solution, and meanwhile libvirt still
> doesn't support ZNS devices.)
> 
> If it can be reproduced with just the RST feature, I may provide some
> help digging into the ASSERT().

Let me check. It's very sporadic as well, unfortunately.
Johannes Thumshirn July 8, 2024, 10:52 a.m. UTC | #5
On 08.07.24 07:26, Johannes Thumshirn wrote:
> On 08.07.24 07:20, Qu Wenruo wrote:
>>
>> Can the ASSERT() be reproduced without a zoned device? (I'm really not a
>> fan of the existing tcmu-emulated solution, and meanwhile libvirt still
>> doesn't support ZNS devices.)
>>
>> If it can be reproduced with just the RST feature, I may provide some
>> help digging into the ASSERT().
> 
> Let me check. It's very sporadic as well, unfortunately.
> 
> 

OK, I've managed to trigger the failure with btrfs/070 on a 
SCRATCH_DEV_POOL with 5 non-zoned devices.
Qu Wenruo July 8, 2024, 11:02 p.m. UTC | #6
On 2024/7/8 20:22, Johannes Thumshirn wrote:
> On 08.07.24 07:26, Johannes Thumshirn wrote:
>> On 08.07.24 07:20, Qu Wenruo wrote:
>>>
>>> Can the ASSERT() be reproduced without a zoned device? (I'm really not a
>>> fan of the existing tcmu-emulated solution, and meanwhile libvirt still
>>> doesn't support ZNS devices.)
>>>
>>> If it can be reproduced with just the RST feature, I may provide some
>>> help digging into the ASSERT().
>>
>> Let me check. It's very sporadic as well, unfortunately.
>>
>>
>
> OK, I've managed to trigger the failure with btrfs/070 on a
> SCRATCH_DEV_POOL with 5 non-zoned devices.

I'm hitting errors like this:

[  227.898320] ------------[ cut here ]------------
[  227.898817] BTRFS: Transaction aborted (error -17)
[  227.899250] WARNING: CPU: 7 PID: 65 at
fs/btrfs/raid-stripe-tree.c:116 btrfs_insert_raid_extent+0x337/0x3d0 [btrfs]
[  227.900122] Modules linked in: btrfs blake2b_generic xor
zstd_compress vfat fat intel_rapl_msr intel_rapl_common crct10dif_pclmul
crc32_pclmul ghash_clmulni_intel iTCO_wdt iTCO_vendor_support
aesni_intel crypto_simd cryptd psmouse i2c_i801 pcspkr i2c_smbus lpc_ich
intel_agp intel_gtt joydev agpgart mousedev raid6_pq libcrc32c loop drm
fuse qemu_fw_cfg ext4 crc32c_generic crc16 mbcache jbd2 dm_mod
virtio_rng virtio_net virtio_blk virtio_balloon net_failover
virtio_console failover virtio_scsi rng_core dimlib usbhid virtio_pci
virtio_pci_legacy_dev crc32c_intel virtio_pci_modern_dev serio_raw
[  227.904452] CPU: 7 PID: 65 Comm: kworker/u40:0 Not tainted
6.10.0-rc6-custom+ #167
[  227.905220] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
unknown 2/2/2022
[  227.905827] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[  227.906558] RIP: 0010:btrfs_insert_raid_extent+0x337/0x3d0 [btrfs]
[  227.907246] Code: 89 6b 08 e8 4b 18 f7 ff 49 8b 84 24 50 02 00 00 4c
39 f0 75 be 31 db e9 7d fe ff ff 89 de 48 c7 c7 f0 8d 9d a0 e8 29 a1 79
e0 <0f> 0b e9 69 ff ff ff e8 bd 95 3e e1 49 8b 46 60 48 05 48 1a 00 00
[  227.908277] BTRFS: error (device dm-3 state A) in
btrfs_insert_one_raid_extent:116: errno=-17 Object already exists
[  227.909356] RSP: 0018:ffffc9000026fca0 EFLAGS: 00010282
[  227.909361] RAX: 0000000000000000 RBX: 00000000ffffffef RCX:
0000000000000027
[  227.911934] RDX: ffff88817bda1948 RSI: 0000000000000001 RDI:
ffff88817bda1940
[  227.912722] RBP: ffff8881029dcbe0 R08: 0000000000000000 R09:
0000000000000003
[  227.913095] BTRFS info (device dm-3 state EA): forced readonly
[  227.913569] R10: ffffc9000026fb38 R11: ffffffff826d0508 R12:
0000000000000010
[  227.915182] R13: ffff8881029dcbe0 R14: ffff88812a5ff790 R15:
ffff8881488f2780
[  227.916130] FS:  0000000000000000(0000) GS:ffff88817bd80000(0000)
knlGS:0000000000000000
[  227.916912] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  227.917500] CR2: 0000561364dec000 CR3: 00000001583ca000 CR4:
0000000000750ef0
[  227.918210] PKRU: 55555554
[  227.918484] Call Trace:
[  227.918727]  <TASK>
[  227.918940]  ? __warn+0x8c/0x180
[  227.919299]  ? btrfs_insert_raid_extent+0x337/0x3d0 [btrfs]
[  227.919891]  ? report_bug+0x164/0x190
[  227.920272]  ? prb_read_valid+0x1b/0x30
[  227.920666]  ? handle_bug+0x3c/0x80
[  227.921013]  ? exc_invalid_op+0x17/0x70
[  227.921397]  ? asm_exc_invalid_op+0x1a/0x20
[  227.921835]  ? btrfs_insert_raid_extent+0x337/0x3d0 [btrfs]
[  227.922440]  btrfs_finish_one_ordered+0x3c3/0xaa0 [btrfs]
[  227.923055]  ? srso_alias_return_thunk+0x5/0xfbef5
[  227.923549]  ? srso_alias_return_thunk+0x5/0xfbef5
[  227.924062]  btrfs_work_helper+0x107/0x4c0 [btrfs]
[  227.924612]  ? lock_is_held_type+0x9a/0x110
[  227.925040]  process_one_work+0x212/0x720
[  227.925454]  ? srso_alias_return_thunk+0x5/0xfbef5
[  227.926010]  worker_thread+0x1dc/0x3b0
[  227.926411]  ? __pfx_worker_thread+0x10/0x10
[  227.926918]  kthread+0xe0/0x110
[  227.927377]  ? __pfx_kthread+0x10/0x10
[  227.927776]  ret_from_fork+0x31/0x50
[  227.928151]  ? __pfx_kthread+0x10/0x10
[  227.928564]  ret_from_fork_asm+0x1a/0x30
[  227.929035]  </TASK>
[  227.929305] irq event stamp: 11077
[  227.929710] hardirqs last  enabled at (11085): [<ffffffff8115daf5>]
console_unlock+0x135/0x160
[  227.930725] hardirqs last disabled at (11094): [<ffffffff8115dada>]
console_unlock+0x11a/0x160
[  227.931730] softirqs last  enabled at (10728): [<ffffffff810b4684>]
__irq_exit_rcu+0x84/0xa0
[  227.932568] softirqs last disabled at (10723): [<ffffffff810b4684>]
__irq_exit_rcu+0x84/0xa0
[  227.933494] ---[ end trace 0000000000000000 ]---
[  227.933992] BTRFS: error (device dm-3 state EA) in
btrfs_insert_one_raid_extent:116: errno=-17 Object already exists
[  227.935193] BTRFS warning (device dm-3 state EA): Skipping commit of
aborted transaction.
[  227.936383] BTRFS: error (device dm-3 state EA) in
cleanup_transaction:2018: errno=-17 Object already exists

But not that ASSERT() yet.

I guess I need the first patch to get past this first?

Thanks,
Qu
Johannes Thumshirn July 9, 2024, 5:51 a.m. UTC | #7
On 09.07.24 01:02, Qu Wenruo wrote:
> But not that ASSERT() yet.
> 
> I guess I need the first patch to get past this first?

Yes, otherwise the EEXIST path will be triggered more frequently.

Patch

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 451203055bbf..82278e54969e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3863,6 +3863,7 @@  static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 
 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+	       key.type != BTRFS_RAID_STRIPE_KEY &&
 	       key.type != BTRFS_EXTENT_CSUM_KEY);
 
 	if (btrfs_leaf_free_space(leaf) >= ins_len)
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 84746c8886be..d2a6e409b3e8 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -33,42 +33,92 @@  int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
 	if (!path)
 		return -ENOMEM;
 
-	while (1) {
-		key.objectid = start;
-		key.type = BTRFS_RAID_STRIPE_KEY;
-		key.offset = length;
+again:
+	key.objectid = start;
+	key.type = BTRFS_RAID_STRIPE_KEY;
+	key.offset = length;
 
-		ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
-		if (ret < 0)
-			break;
-		if (ret > 0) {
-			ret = 0;
-			if (path->slots[0] == 0)
-				break;
-			path->slots[0]--;
-		}
+	ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = 0;
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+	}
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	found_start = key.objectid;
+	found_end = found_start + key.offset;
+
+	/* That stripe ends before we start, we're done. */
+	if (found_end <= start)
+		goto out;
+
+	trace_btrfs_raid_extent_delete(fs_info, start, end,
+				       found_start, found_end);
+
+	if (found_start < start) {
+		u64 diff = start - found_start;
+		struct btrfs_key new_key;
+		int num_stripes;
+		struct btrfs_stripe_extent *stripe_extent;
+
+		new_key.objectid = start;
+		new_key.type = BTRFS_RAID_STRIPE_KEY;
+		new_key.offset = length - diff;
+
+		ret = btrfs_duplicate_item(trans, stripe_root, path,
+					   &new_key);
+		if (ret)
+			goto out;
 
 		leaf = path->nodes[0];
 		slot = path->slots[0];
-		btrfs_item_key_to_cpu(leaf, &key, slot);
-		found_start = key.objectid;
-		found_end = found_start + key.offset;
 
-		/* That stripe ends before we start, we're done. */
-		if (found_end <= start)
-			break;
+		num_stripes =
+			btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+		stripe_extent =
+			btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+		for (int i = 0; i < num_stripes; i++) {
+			struct btrfs_raid_stride *raid_stride =
+				&stripe_extent->strides[i];
+			u64 physical =
+				btrfs_raid_stride_physical(leaf, raid_stride);
+
+			btrfs_set_raid_stride_physical(leaf, raid_stride,
+							     physical + diff);
+		}
+
+		btrfs_mark_buffer_dirty(trans, leaf);
+		btrfs_release_path(path);
+		goto again;
+	}
 
-		trace_btrfs_raid_extent_delete(fs_info, start, end,
-					       found_start, found_end);
+	if (found_end > end) {
+		u64 diff = found_end - end;
+		struct btrfs_key new_key;
 
-		ASSERT(found_start >= start && found_end <= end);
-		ret = btrfs_del_item(trans, stripe_root, path);
+		new_key.objectid = found_start;
+		new_key.type = BTRFS_RAID_STRIPE_KEY;
+		new_key.offset = length - diff;
+
+		ret = btrfs_duplicate_item(trans, stripe_root, path,
+					   &new_key);
 		if (ret)
-			break;
+			goto out;
 
+		btrfs_mark_buffer_dirty(trans, leaf);
 		btrfs_release_path(path);
 	}
 
+	ret = btrfs_del_item(trans, stripe_root, path);
+
+ out:
 	btrfs_free_path(path);
 	return ret;
 }