diff mbox series

[v2,08/14] btrfs: don't use btrfs_set_item_key_safe on RAID stripe-extents

Message ID 20250107-rst-delete-fixes-v2-8-0c7b14c0aac2@kernel.org (mailing list archive)
State New, archived
Headers show
Series btrfs: more RST delete fixes | expand

Commit Message

Johannes Thumshirn Jan. 7, 2025, 12:47 p.m. UTC
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>

Don't use btrfs_set_item_key_safe() to modify the keys in the RAID
stripe-tree as this can lead to corruption of the tree, which is caught by
the checks in btrfs_set_item_key_safe():

 BTRFS info (device nvme1n1): leaf 49168384 gen 15 total ptrs 194 free space 8329 owner 12
 BTRFS info (device nvme1n1): refs 2 lock_owner 1030 current 1030
  [ snip ]
  item 105 key (354549760 230 20480) itemoff 14587 itemsize 16
                  stride 0 devid 5 physical 67502080
  item 106 key (354631680 230 4096) itemoff 14571 itemsize 16
                  stride 0 devid 1 physical 88559616
  item 107 key (354631680 230 32768) itemoff 14555 itemsize 16
                  stride 0 devid 1 physical 88555520
  item 108 key (354717696 230 28672) itemoff 14539 itemsize 16
                  stride 0 devid 2 physical 67604480
  [ snip ]
 BTRFS critical (device nvme1n1): slot 106 key (354631680 230 32768) new key (354635776 230 4096)
 ------------[ cut here ]------------
 kernel BUG at fs/btrfs/ctree.c:2602!
 Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
 CPU: 1 UID: 0 PID: 1055 Comm: fsstress Not tainted 6.13.0-rc1+ #1464
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014
 RIP: 0010:btrfs_set_item_key_safe+0xf7/0x270
 Code: <snip>
 RSP: 0018:ffffc90001337ab0 EFLAGS: 00010287
 RAX: 0000000000000000 RBX: ffff8881115fd000 RCX: 0000000000000000
 RDX: 0000000000000001 RSI: 0000000000000001 RDI: 00000000ffffffff
 RBP: ffff888110ed6f50 R08: 00000000ffffefff R09: ffffffff8244c500
 R10: 00000000ffffefff R11: 00000000ffffffff R12: ffff888100586000
 R13: 00000000000000c9 R14: ffffc90001337b1f R15: ffff888110f23b58
 FS:  00007f7d75c72740(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007fa811652c60 CR3: 0000000111398001 CR4: 0000000000370eb0
 Call Trace:
  <TASK>
  ? __die_body.cold+0x14/0x1a
  ? die+0x2e/0x50
  ? do_trap+0xca/0x110
  ? do_error_trap+0x65/0x80
  ? btrfs_set_item_key_safe+0xf7/0x270
  ? exc_invalid_op+0x50/0x70
  ? btrfs_set_item_key_safe+0xf7/0x270
  ? asm_exc_invalid_op+0x1a/0x20
  ? btrfs_set_item_key_safe+0xf7/0x270
  btrfs_partially_delete_raid_extent+0xc4/0xe0
  btrfs_delete_raid_extent+0x227/0x240
  __btrfs_free_extent.isra.0+0x57f/0x9c0
  ? exc_coproc_segment_overrun+0x40/0x40
  __btrfs_run_delayed_refs+0x2fa/0xe80
  btrfs_run_delayed_refs+0x81/0xe0
  btrfs_commit_transaction+0x2dd/0xbe0
  ? preempt_count_add+0x52/0xb0
  btrfs_sync_file+0x375/0x4c0
  do_fsync+0x39/0x70
  __x64_sys_fsync+0x13/0x20
  do_syscall_64+0x54/0x110
  entry_SYSCALL_64_after_hwframe+0x76/0x7e
 RIP: 0033:0x7f7d7550ef90
 Code: <snip>
 RSP: 002b:00007ffd70237248 EFLAGS: 00000202 ORIG_RAX: 000000000000004a
 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f7d7550ef90
 RDX: 000000000000013a RSI: 000000000040eb28 RDI: 0000000000000004
 RBP: 000000000000001b R08: 0000000000000078 R09: 00007ffd7023725c
 R10: 00007f7d75400390 R11: 0000000000000202 R12: 028f5c28f5c28f5c
 R13: 8f5c28f5c28f5c29 R14: 000000000040b520 R15: 00007f7d75c726c8
  </TASK>

Instead copy the item, adjust the key and per-device physical addresses
and re-insert it into the tree.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/raid-stripe-tree.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

Comments

David Sterba Jan. 9, 2025, 10:50 a.m. UTC | #1
On Tue, Jan 07, 2025 at 01:47:38PM +0100, Johannes Thumshirn wrote:
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> 
> Don't use btrfs_set_item_key_safe() to modify the keys in the RAID
> stripe-tree as this can lead to corruption of the tree, which is caught by
> the checks in btrfs_set_item_key_safe():
> 
>  BTRFS info (device nvme1n1): leaf 49168384 gen 15 total ptrs 194 free space 8329 owner 12
>  BTRFS info (device nvme1n1): refs 2 lock_owner 1030 current 1030
>   [ snip ]
>   item 105 key (354549760 230 20480) itemoff 14587 itemsize 16
>                   stride 0 devid 5 physical 67502080
>   item 106 key (354631680 230 4096) itemoff 14571 itemsize 16
>                   stride 0 devid 1 physical 88559616
>   item 107 key (354631680 230 32768) itemoff 14555 itemsize 16
>                   stride 0 devid 1 physical 88555520
>   item 108 key (354717696 230 28672) itemoff 14539 itemsize 16
>                   stride 0 devid 2 physical 67604480
>   [ snip ]
>  BTRFS critical (device nvme1n1): slot 106 key (354631680 230 32768) new key (354635776 230 4096)
>  ------------[ cut here ]------------
>  kernel BUG at fs/btrfs/ctree.c:2602!
>  Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
>  CPU: 1 UID: 0 PID: 1055 Comm: fsstress Not tainted 6.13.0-rc1+ #1464
>  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014
>  RIP: 0010:btrfs_set_item_key_safe+0xf7/0x270
>  Code: <snip>
>  RSP: 0018:ffffc90001337ab0 EFLAGS: 00010287
>  RAX: 0000000000000000 RBX: ffff8881115fd000 RCX: 0000000000000000
>  RDX: 0000000000000001 RSI: 0000000000000001 RDI: 00000000ffffffff
>  RBP: ffff888110ed6f50 R08: 00000000ffffefff R09: ffffffff8244c500
>  R10: 00000000ffffefff R11: 00000000ffffffff R12: ffff888100586000
>  R13: 00000000000000c9 R14: ffffc90001337b1f R15: ffff888110f23b58
>  FS:  00007f7d75c72740(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: 00007fa811652c60 CR3: 0000000111398001 CR4: 0000000000370eb0
>  Call Trace:
>   <TASK>
>   ? __die_body.cold+0x14/0x1a
>   ? die+0x2e/0x50
>   ? do_trap+0xca/0x110
>   ? do_error_trap+0x65/0x80
>   ? btrfs_set_item_key_safe+0xf7/0x270
>   ? exc_invalid_op+0x50/0x70
>   ? btrfs_set_item_key_safe+0xf7/0x270
>   ? asm_exc_invalid_op+0x1a/0x20
>   ? btrfs_set_item_key_safe+0xf7/0x270
>   btrfs_partially_delete_raid_extent+0xc4/0xe0
>   btrfs_delete_raid_extent+0x227/0x240
>   __btrfs_free_extent.isra.0+0x57f/0x9c0
>   ? exc_coproc_segment_overrun+0x40/0x40
>   __btrfs_run_delayed_refs+0x2fa/0xe80
>   btrfs_run_delayed_refs+0x81/0xe0
>   btrfs_commit_transaction+0x2dd/0xbe0
>   ? preempt_count_add+0x52/0xb0
>   btrfs_sync_file+0x375/0x4c0
>   do_fsync+0x39/0x70
>   __x64_sys_fsync+0x13/0x20
>   do_syscall_64+0x54/0x110
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>  RIP: 0033:0x7f7d7550ef90
>  Code: <snip>
>  RSP: 002b:00007ffd70237248 EFLAGS: 00000202 ORIG_RAX: 000000000000004a
>  RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f7d7550ef90
>  RDX: 000000000000013a RSI: 000000000040eb28 RDI: 0000000000000004
>  RBP: 000000000000001b R08: 0000000000000078 R09: 00007ffd7023725c
>  R10: 00007f7d75400390 R11: 0000000000000202 R12: 028f5c28f5c28f5c
>  R13: 8f5c28f5c28f5c29 R14: 000000000040b520 R15: 00007f7d75c726c8
>   </TASK>
> 
> Instead copy the item, adjust the key and per-device physical addresses
> and re-insert it into the tree.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  fs/btrfs/raid-stripe-tree.c | 26 +++++++++++++++++++++-----
>  1 file changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index d15df49c61a86a4188b822b05453428e444920b5..a4225ad043216e5d7035a71eab6bcc49b242836f 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -13,12 +13,13 @@
>  #include "volumes.h"
>  #include "print-tree.h"
>  
> -static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
> +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
>  					       struct btrfs_path *path,
>  					       const struct btrfs_key *oldkey,
>  					       u64 newlen, u64 frontpad)
>  {
> -	struct btrfs_stripe_extent *extent;
> +	struct btrfs_root *stripe_root = trans->fs_info->stripe_root;
> +	struct btrfs_stripe_extent *extent, *new;

Maybe call it 'newitem' so it's clear that it's related to newlen and
newkey.

>  	struct extent_buffer *leaf;
>  	int slot;
>  	size_t item_size;
Filipe Manana Jan. 9, 2025, 3:44 p.m. UTC | #2
On Tue, Jan 7, 2025 at 12:59 PM Johannes Thumshirn <jth@kernel.org> wrote:
>
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>
> Don't use btrfs_set_item_key_safe() to modify the keys in the RAID
> stripe-tree as this can lead to corruption of the tree, which is caught by
> the checks in btrfs_set_item_key_safe():
>
>  BTRFS info (device nvme1n1): leaf 49168384 gen 15 total ptrs 194 free space 8329 owner 12
>  BTRFS info (device nvme1n1): refs 2 lock_owner 1030 current 1030
>   [ snip ]
>   item 105 key (354549760 230 20480) itemoff 14587 itemsize 16
>                   stride 0 devid 5 physical 67502080
>   item 106 key (354631680 230 4096) itemoff 14571 itemsize 16
>                   stride 0 devid 1 physical 88559616
>   item 107 key (354631680 230 32768) itemoff 14555 itemsize 16
>                   stride 0 devid 1 physical 88555520
>   item 108 key (354717696 230 28672) itemoff 14539 itemsize 16
>                   stride 0 devid 2 physical 67604480
>   [ snip ]
>  BTRFS critical (device nvme1n1): slot 106 key (354631680 230 32768) new key (354635776 230 4096)
>  ------------[ cut here ]------------
>  kernel BUG at fs/btrfs/ctree.c:2602!
>  Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
>  CPU: 1 UID: 0 PID: 1055 Comm: fsstress Not tainted 6.13.0-rc1+ #1464
>  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014
>  RIP: 0010:btrfs_set_item_key_safe+0xf7/0x270
>  Code: <snip>
>  RSP: 0018:ffffc90001337ab0 EFLAGS: 00010287
>  RAX: 0000000000000000 RBX: ffff8881115fd000 RCX: 0000000000000000
>  RDX: 0000000000000001 RSI: 0000000000000001 RDI: 00000000ffffffff
>  RBP: ffff888110ed6f50 R08: 00000000ffffefff R09: ffffffff8244c500
>  R10: 00000000ffffefff R11: 00000000ffffffff R12: ffff888100586000
>  R13: 00000000000000c9 R14: ffffc90001337b1f R15: ffff888110f23b58
>  FS:  00007f7d75c72740(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: 00007fa811652c60 CR3: 0000000111398001 CR4: 0000000000370eb0
>  Call Trace:
>   <TASK>
>   ? __die_body.cold+0x14/0x1a
>   ? die+0x2e/0x50
>   ? do_trap+0xca/0x110
>   ? do_error_trap+0x65/0x80
>   ? btrfs_set_item_key_safe+0xf7/0x270
>   ? exc_invalid_op+0x50/0x70
>   ? btrfs_set_item_key_safe+0xf7/0x270
>   ? asm_exc_invalid_op+0x1a/0x20
>   ? btrfs_set_item_key_safe+0xf7/0x270
>   btrfs_partially_delete_raid_extent+0xc4/0xe0
>   btrfs_delete_raid_extent+0x227/0x240
>   __btrfs_free_extent.isra.0+0x57f/0x9c0
>   ? exc_coproc_segment_overrun+0x40/0x40
>   __btrfs_run_delayed_refs+0x2fa/0xe80
>   btrfs_run_delayed_refs+0x81/0xe0
>   btrfs_commit_transaction+0x2dd/0xbe0
>   ? preempt_count_add+0x52/0xb0
>   btrfs_sync_file+0x375/0x4c0
>   do_fsync+0x39/0x70
>   __x64_sys_fsync+0x13/0x20
>   do_syscall_64+0x54/0x110
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>  RIP: 0033:0x7f7d7550ef90
>  Code: <snip>
>  RSP: 002b:00007ffd70237248 EFLAGS: 00000202 ORIG_RAX: 000000000000004a
>  RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f7d7550ef90
>  RDX: 000000000000013a RSI: 000000000040eb28 RDI: 0000000000000004
>  RBP: 000000000000001b R08: 0000000000000078 R09: 00007ffd7023725c
>  R10: 00007f7d75400390 R11: 0000000000000202 R12: 028f5c28f5c28f5c
>  R13: 8f5c28f5c28f5c29 R14: 000000000040b520 R15: 00007f7d75c726c8
>   </TASK>
>
> Instead copy the item, adjust the key and per-device physical addresses
> and re-insert it into the tree.

So my comments are basically the same as in the previous version.
Why do we hit this situation, what's the bug in the algorithm that
makes us try to set a key that breaks the key ordering in the leaf?

Did this happen even with all previous fixes applied?

Looking at this change log I'm reading it as "not sure what causes the
bug, so switching to a delete + insert as that always results in a
correct key order".

Thanks.


>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  fs/btrfs/raid-stripe-tree.c | 26 +++++++++++++++++++++-----
>  1 file changed, 21 insertions(+), 5 deletions(-)
>
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index d15df49c61a86a4188b822b05453428e444920b5..a4225ad043216e5d7035a71eab6bcc49b242836f 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -13,12 +13,13 @@
>  #include "volumes.h"
>  #include "print-tree.h"
>
> -static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
> +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
>                                                struct btrfs_path *path,
>                                                const struct btrfs_key *oldkey,
>                                                u64 newlen, u64 frontpad)
>  {
> -       struct btrfs_stripe_extent *extent;
> +       struct btrfs_root *stripe_root = trans->fs_info->stripe_root;
> +       struct btrfs_stripe_extent *extent, *new;
>         struct extent_buffer *leaf;
>         int slot;
>         size_t item_size;
> @@ -27,6 +28,7 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
>                 .type = BTRFS_RAID_STRIPE_KEY,
>                 .offset = newlen,
>         };
> +       int ret;
>
>         ASSERT(newlen > 0);
>         ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);
> @@ -34,17 +36,31 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
>         leaf = path->nodes[0];
>         slot = path->slots[0];
>         item_size = btrfs_item_size(leaf, slot);
> +
> +       new = kzalloc(item_size, GFP_NOFS);
> +       if (!new)
> +               return -ENOMEM;
> +
>         extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
>
>         for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
>                 struct btrfs_raid_stride *stride = &extent->strides[i];
>                 u64 phys;
>
> -               phys = btrfs_raid_stride_physical(leaf, stride);
> -               btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad);
> +               phys = btrfs_raid_stride_physical(leaf, stride) + frontpad;
> +               btrfs_set_stack_raid_stride_physical(&new->strides[i], phys);
>         }
>
> -       btrfs_set_item_key_safe(trans, path, &newkey);
> +       ret = btrfs_del_item(trans, stripe_root, path);
> +       if (ret)
> +               goto out;
> +
> +       btrfs_release_path(path);
> +       ret = btrfs_insert_item(trans, stripe_root, &newkey, new, item_size);
> +
> +out:
> +       kfree(new);
> +       return ret;
>  }
>
>  int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
>
> --
> 2.43.0
>
>
Johannes Thumshirn Jan. 9, 2025, 4 p.m. UTC | #3
On 09.01.25 16:45, Filipe Manana wrote:
> On Tue, Jan 7, 2025 at 12:59 PM Johannes Thumshirn <jth@kernel.org> wrote:
>>
>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>
>> Don't use btrfs_set_item_key_safe() to modify the keys in the RAID
>> stripe-tree as this can lead to corruption of the tree, which is caught by
>> the checks in btrfs_set_item_key_safe():
>>
>>   BTRFS info (device nvme1n1): leaf 49168384 gen 15 total ptrs 194 free space 8329 owner 12
>>   BTRFS info (device nvme1n1): refs 2 lock_owner 1030 current 1030
>>    [ snip ]
>>    item 105 key (354549760 230 20480) itemoff 14587 itemsize 16
>>                    stride 0 devid 5 physical 67502080
>>    item 106 key (354631680 230 4096) itemoff 14571 itemsize 16
>>                    stride 0 devid 1 physical 88559616
>>    item 107 key (354631680 230 32768) itemoff 14555 itemsize 16
>>                    stride 0 devid 1 physical 88555520
>>    item 108 key (354717696 230 28672) itemoff 14539 itemsize 16
>>                    stride 0 devid 2 physical 67604480
>>    [ snip ]
>>   BTRFS critical (device nvme1n1): slot 106 key (354631680 230 32768) new key (354635776 230 4096)
>>   ------------[ cut here ]------------
>>   kernel BUG at fs/btrfs/ctree.c:2602!
>>   Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
>>   CPU: 1 UID: 0 PID: 1055 Comm: fsstress Not tainted 6.13.0-rc1+ #1464
>>   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014
>>   RIP: 0010:btrfs_set_item_key_safe+0xf7/0x270
>>   Code: <snip>
>>   RSP: 0018:ffffc90001337ab0 EFLAGS: 00010287
>>   RAX: 0000000000000000 RBX: ffff8881115fd000 RCX: 0000000000000000
>>   RDX: 0000000000000001 RSI: 0000000000000001 RDI: 00000000ffffffff
>>   RBP: ffff888110ed6f50 R08: 00000000ffffefff R09: ffffffff8244c500
>>   R10: 00000000ffffefff R11: 00000000ffffffff R12: ffff888100586000
>>   R13: 00000000000000c9 R14: ffffc90001337b1f R15: ffff888110f23b58
>>   FS:  00007f7d75c72740(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000
>>   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>   CR2: 00007fa811652c60 CR3: 0000000111398001 CR4: 0000000000370eb0
>>   Call Trace:
>>    <TASK>
>>    ? __die_body.cold+0x14/0x1a
>>    ? die+0x2e/0x50
>>    ? do_trap+0xca/0x110
>>    ? do_error_trap+0x65/0x80
>>    ? btrfs_set_item_key_safe+0xf7/0x270
>>    ? exc_invalid_op+0x50/0x70
>>    ? btrfs_set_item_key_safe+0xf7/0x270
>>    ? asm_exc_invalid_op+0x1a/0x20
>>    ? btrfs_set_item_key_safe+0xf7/0x270
>>    btrfs_partially_delete_raid_extent+0xc4/0xe0
>>    btrfs_delete_raid_extent+0x227/0x240
>>    __btrfs_free_extent.isra.0+0x57f/0x9c0
>>    ? exc_coproc_segment_overrun+0x40/0x40
>>    __btrfs_run_delayed_refs+0x2fa/0xe80
>>    btrfs_run_delayed_refs+0x81/0xe0
>>    btrfs_commit_transaction+0x2dd/0xbe0
>>    ? preempt_count_add+0x52/0xb0
>>    btrfs_sync_file+0x375/0x4c0
>>    do_fsync+0x39/0x70
>>    __x64_sys_fsync+0x13/0x20
>>    do_syscall_64+0x54/0x110
>>    entry_SYSCALL_64_after_hwframe+0x76/0x7e
>>   RIP: 0033:0x7f7d7550ef90
>>   Code: <snip>
>>   RSP: 002b:00007ffd70237248 EFLAGS: 00000202 ORIG_RAX: 000000000000004a
>>   RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f7d7550ef90
>>   RDX: 000000000000013a RSI: 000000000040eb28 RDI: 0000000000000004
>>   RBP: 000000000000001b R08: 0000000000000078 R09: 00007ffd7023725c
>>   R10: 00007f7d75400390 R11: 0000000000000202 R12: 028f5c28f5c28f5c
>>   R13: 8f5c28f5c28f5c29 R14: 000000000040b520 R15: 00007f7d75c726c8
>>    </TASK>
>>
>> Instead copy the item, adjust the key and per-device physical addresses
>> and re-insert it into the tree.
> 
> So my comments are basically the same as in the previous version.
> Why do we hit this situation, what's the bug in the algorithm that
> makes us try to set a key that breaks the key ordering in the leaf?
> 
> Did this happen even with all previous fixes applied?

Yes.

> Looking at this change log I'm reading it as "not sure what causes the
> bug, so switching to a delete + insert as that always results in a
> correct key order".

Correct, also looking at btrfs_drop_extents() it only uses 
btrfs_set_item_key_safe() when truncating an extent, AFAIU.
Otherwise btrfs_drop_extents() uses btrfs_duplicate_item() as well.
diff mbox series

Patch

diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index d15df49c61a86a4188b822b05453428e444920b5..a4225ad043216e5d7035a71eab6bcc49b242836f 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -13,12 +13,13 @@ 
 #include "volumes.h"
 #include "print-tree.h"
 
-static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
+static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
 					       struct btrfs_path *path,
 					       const struct btrfs_key *oldkey,
 					       u64 newlen, u64 frontpad)
 {
-	struct btrfs_stripe_extent *extent;
+	struct btrfs_root *stripe_root = trans->fs_info->stripe_root;
+	struct btrfs_stripe_extent *extent, *new;
 	struct extent_buffer *leaf;
 	int slot;
 	size_t item_size;
@@ -27,6 +28,7 @@  static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
 		.type = BTRFS_RAID_STRIPE_KEY,
 		.offset = newlen,
 	};
+	int ret;
 
 	ASSERT(newlen > 0);
 	ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);
@@ -34,17 +36,31 @@  static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	slot = path->slots[0];
 	item_size = btrfs_item_size(leaf, slot);
+
+	new = kzalloc(item_size, GFP_NOFS);
+	if (!new)
+		return -ENOMEM;
+
 	extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
 
 	for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
 		struct btrfs_raid_stride *stride = &extent->strides[i];
 		u64 phys;
 
-		phys = btrfs_raid_stride_physical(leaf, stride);
-		btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad);
+		phys = btrfs_raid_stride_physical(leaf, stride) + frontpad;
+		btrfs_set_stack_raid_stride_physical(&new->strides[i], phys);
 	}
 
-	btrfs_set_item_key_safe(trans, path, &newkey);
+	ret = btrfs_del_item(trans, stripe_root, path);
+	if (ret)
+		goto out;
+
+	btrfs_release_path(path);
+	ret = btrfs_insert_item(trans, stripe_root, &newkey, new, item_size);
+
+out:
+	kfree(new);
+	return ret;
 }
 
 int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)