diff mbox series

btrfs: fix error propagation of split bios

Message ID db5c272fc27c59ddded5c691373c26458698cb1a.1728489285.git.naohiro.aota@wdc.com (mailing list archive)
State New
Headers show
Series btrfs: fix error propagation of split bios | expand

Commit Message

Naohiro Aota Oct. 9, 2024, 3:57 p.m. UTC
The purpose of btrfs_bbio_propagate_error() shall be propagating an error
of split bio to its original btrfs_bio, and tell the error to the upper
layer. However, it's not working well on some cases.

* Case 1. Immediate (or quick) end_bio with an error

When btrfs sends btrfs_bio to mirrored devices, btrfs calls
btrfs_bio_end_io() when all the mirroring bios are completed. If that
btrfs_bio was split, it is from btrfs_clone_bioset and its end_io function
is btrfs_orig_write_end_io. For this case, btrfs_bbio_propagate_error()
accesses the orig_bbio's bio context to increase the error count.

That works well in most cases. However, if the end_io is called enough
fast, orig_bbio's bio context may not be properly set at that time. Since
the bio context is set when the orig_bbio (the last btrfs_bio) is sent to
devices, that might be too late for earlier split btrfs_bio's completion.
That will result in NULL pointer dereference.

That bug is easily reproducible by running btrfs/146 on zoned devices and
it shows the following trace.

    [   20.923980][   T13] BUG: kernel NULL pointer dereference, address: 0000000000000020
    [   20.925234][   T13] #PF: supervisor read access in kernel mode
    [   20.926122][   T13] #PF: error_code(0x0000) - not-present page
    [   20.927118][   T13] PGD 0 P4D 0
    [   20.927607][   T13] Oops: Oops: 0000 [#1] PREEMPT SMP PTI
    [   20.928424][   T13] CPU: 1 UID: 0 PID: 13 Comm: kworker/u32:1 Not tainted 6.11.0-rc7-BTRFS-ZNS+ #474
    [   20.929740][   T13] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
    [   20.930697][   T13] Workqueue: writeback wb_workfn (flush-btrfs-5)
    [   20.931643][   T13] RIP: 0010:btrfs_bio_end_io+0xae/0xc0 [btrfs]
    [   20.932573][ T1415] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0
    [   20.932871][   T13] Code: ba e1 48 8b 7b 10 e8 f1 f5 f6 ff eb da 48 81 bf 10 01 00 00 40 0c 33 a0 74 09 40 88 b5 f1 00 00 00 eb b8 48 8b 85 18 01 00 00 <48> 8b 40 20 0f b7 50 24 f0 01 50 20 eb a3 0f 1f 40 00 90 90 90 90
    [   20.936623][   T13] RSP: 0018:ffffc9000006f248 EFLAGS: 00010246
    [   20.937543][   T13] RAX: 0000000000000000 RBX: ffff888005a7f080 RCX: ffffc9000006f1dc
    [   20.938788][   T13] RDX: 0000000000000000 RSI: 000000000000000a RDI: ffff888005a7f080
    [   20.940016][   T13] RBP: ffff888011dfc540 R08: 0000000000000000 R09: 0000000000000001
    [   20.941227][   T13] R10: ffffffff82e508e0 R11: 0000000000000005 R12: ffff88800ddfbe58
    [   20.942375][   T13] R13: ffff888005a7f080 R14: ffff888005a7f158 R15: ffff888005a7f158
    [   20.943531][   T13] FS:  0000000000000000(0000) GS:ffff88803ea80000(0000) knlGS:0000000000000000
    [   20.944838][   T13] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    [   20.945811][   T13] CR2: 0000000000000020 CR3: 0000000002e22006 CR4: 0000000000370ef0
    [   20.946984][   T13] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
    [   20.948150][   T13] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
    [   20.949327][   T13] Call Trace:
    [   20.949949][   T13]  <TASK>
    [   20.950374][   T13]  ? __die_body.cold+0x19/0x26
    [   20.951066][   T13]  ? page_fault_oops+0x13e/0x2b0
    [   20.951766][   T13]  ? _printk+0x58/0x73
    [   20.952358][   T13]  ? do_user_addr_fault+0x5f/0x750
    [   20.953120][   T13]  ? exc_page_fault+0x76/0x240
    [   20.953827][   T13]  ? asm_exc_page_fault+0x22/0x30
    [   20.954606][   T13]  ? btrfs_bio_end_io+0xae/0xc0 [btrfs]
    [   20.955616][   T13]  ? btrfs_log_dev_io_error+0x7f/0x90 [btrfs]
    [   20.956682][   T13]  btrfs_orig_write_end_io+0x51/0x90 [btrfs]
    [   20.957769][   T13]  dm_submit_bio+0x5c2/0xa50 [dm_mod]
    [   20.958623][   T13]  ? find_held_lock+0x2b/0x80
    [   20.959339][   T13]  ? blk_try_enter_queue+0x90/0x1e0
    [   20.960228][   T13]  __submit_bio+0xe0/0x130
    [   20.960879][   T13]  ? ktime_get+0x10a/0x160
    [   20.961546][   T13]  ? lockdep_hardirqs_on+0x74/0x100
    [   20.962310][   T13]  submit_bio_noacct_nocheck+0x199/0x410
    [   20.963140][   T13]  btrfs_submit_bio+0x7d/0x150 [btrfs]
    [   20.964089][   T13]  btrfs_submit_chunk+0x1a1/0x6d0 [btrfs]
    [   20.965066][   T13]  ? lockdep_hardirqs_on+0x74/0x100
    [   20.965824][   T13]  ? __folio_start_writeback+0x10/0x2c0
    [   20.966659][   T13]  btrfs_submit_bbio+0x1c/0x40 [btrfs]
    [   20.967617][   T13]  submit_one_bio+0x44/0x60 [btrfs]
    [   20.968536][   T13]  submit_extent_folio+0x13f/0x330 [btrfs]
    [   20.969552][   T13]  ? btrfs_set_range_writeback+0xa3/0xd0 [btrfs]
    [   20.970625][   T13]  extent_writepage_io+0x18b/0x360 [btrfs]
    [   20.971632][   T13]  extent_write_locked_range+0x17c/0x340 [btrfs]
    [   20.972702][   T13]  ? __pfx_end_bbio_data_write+0x10/0x10 [btrfs]
    [   20.973857][   T13]  run_delalloc_cow+0x71/0xd0 [btrfs]
    [   20.974841][   T13]  btrfs_run_delalloc_range+0x176/0x500 [btrfs]
    [   20.975870][   T13]  ? find_lock_delalloc_range+0x119/0x260 [btrfs]
    [   20.976911][   T13]  writepage_delalloc+0x2ab/0x480 [btrfs]
    [   20.977792][   T13]  extent_write_cache_pages+0x236/0x7d0 [btrfs]
    [   20.978728][   T13]  btrfs_writepages+0x72/0x130 [btrfs]
    [   20.979531][   T13]  do_writepages+0xd4/0x240
    [   20.980111][   T13]  ? find_held_lock+0x2b/0x80
    [   20.980695][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
    [   20.981461][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
    [   20.982213][   T13]  __writeback_single_inode+0x5c/0x4c0
    [   20.982859][   T13]  ? do_raw_spin_unlock+0x49/0xb0
    [   20.983439][   T13]  writeback_sb_inodes+0x22c/0x560
    [   20.984079][   T13]  __writeback_inodes_wb+0x4c/0xe0
    [   20.984886][   T13]  wb_writeback+0x1d6/0x3f0
    [   20.985536][   T13]  wb_workfn+0x334/0x520
    [   20.986044][   T13]  process_one_work+0x1ee/0x570
    [   20.986580][   T13]  ? lock_is_held_type+0xc6/0x130
    [   20.987142][   T13]  worker_thread+0x1d1/0x3b0
    [   20.987918][   T13]  ? __pfx_worker_thread+0x10/0x10
    [   20.988690][   T13]  kthread+0xee/0x120
    [   20.989180][   T13]  ? __pfx_kthread+0x10/0x10
    [   20.989915][   T13]  ret_from_fork+0x30/0x50
    [   20.990615][   T13]  ? __pfx_kthread+0x10/0x10
    [   20.991336][   T13]  ret_from_fork_asm+0x1a/0x30
    [   20.992106][   T13]  </TASK>
    [   20.992482][   T13] Modules linked in: dm_mod btrfs blake2b_generic xor raid6_pq rapl
    [   20.993406][   T13] CR2: 0000000000000020
    [   20.993884][   T13] ---[ end trace 0000000000000000 ]---
    [   20.993954][ T1415] BUG: kernel NULL pointer dereference, address: 0000000000000020

* Case 2. Earlier completion of orig_bbio for mirrored btrfs_bios

btrfs_bbio_propagate_error() assumes the end_io function for orig_bbio is
called last among split bios. In that case, btrfs_orig_write_end_io() sets
the bio->bi_status to BLK_STS_IOERR by seeing the bioc->error [1].
Otherwise, the increased orig_bio's bioc->error is not checked by anyone
and return BLK_STS_OK to the upper layer.

[1] Actually, this is not true. Because we only increases orig_bioc->errors
by max_errors, the condition "atomic_read(&bioc->error) > bioc->max_errors"
is still not met if only one split btrfs_bio fails.

* Case 3. Later completion of orig_bbio for un-mirrored btrfs_bios

In contrast to the above case, btrfs_bbio_propagate_error() is not working
well if un-mirrored orig_bbio is completed last. It sets
orig_bbio->bio.bi_status to the btrfs_bio's error. But, that is easily
over-written by orig_bbio's completion status. If the status is BLK_STS_OK,
the upper layer would not know the failure.

* Solution

Considering the above cases, we can only save the error status in the
orig_bbio itself as it is always available. Also, the saved error status
should be propagated when all the split btrfs_bios are finished (i.e,
bbio->pending_ios == 0).

This commit introduces "status" to btrfs_bbio and uses the last saved error
status for bbio->bio.bi_status.

With this commit, btrfs/146 on zoned devices does not hit the NULL pointer
dereference.

Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios")
CC: stable@vger.kernel.org # 6.6+
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/bio.c | 33 +++++++++------------------------
 fs/btrfs/bio.h |  3 +++
 2 files changed, 12 insertions(+), 24 deletions(-)

Comments

David Sterba Oct. 9, 2024, 4:59 p.m. UTC | #1
On Thu, Oct 10, 2024 at 12:57:50AM +0900, Naohiro Aota wrote:
> The purpose of btrfs_bbio_propagate_error() shall be propagating an error
> of split bio to its original btrfs_bio, and tell the error to the upper
> layer. However, it's not working well on some cases.
> 
> * Case 1. Immediate (or quick) end_bio with an error
> 
> When btrfs sends btrfs_bio to mirrored devices, btrfs calls
> btrfs_bio_end_io() when all the mirroring bios are completed. If that
> btrfs_bio was split, it is from btrfs_clone_bioset and its end_io function
> is btrfs_orig_write_end_io. For this case, btrfs_bbio_propagate_error()
> accesses the orig_bbio's bio context to increase the error count.
> 
> That works well in most cases. However, if the end_io is called enough
> fast, orig_bbio's bio context may not be properly set at that time. Since
> the bio context is set when the orig_bbio (the last btrfs_bio) is sent to
> devices, that might be too late for earlier split btrfs_bio's completion.
> That will result in NULL pointer dereference.
> 
> That bug is easily reproducible by running btrfs/146 on zoned devices and
> it shows the following trace.
> 
>     [   20.923980][   T13] BUG: kernel NULL pointer dereference, address: 0000000000000020
>     [   20.925234][   T13] #PF: supervisor read access in kernel mode
>     [   20.926122][   T13] #PF: error_code(0x0000) - not-present page
>     [   20.927118][   T13] PGD 0 P4D 0
>     [   20.927607][   T13] Oops: Oops: 0000 [#1] PREEMPT SMP PTI
>     [   20.928424][   T13] CPU: 1 UID: 0 PID: 13 Comm: kworker/u32:1 Not tainted 6.11.0-rc7-BTRFS-ZNS+ #474
>     [   20.929740][   T13] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
>     [   20.930697][   T13] Workqueue: writeback wb_workfn (flush-btrfs-5)
>     [   20.931643][   T13] RIP: 0010:btrfs_bio_end_io+0xae/0xc0 [btrfs]
>     [   20.932573][ T1415] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0
>     [   20.932871][   T13] Code: ba e1 48 8b 7b 10 e8 f1 f5 f6 ff eb da 48 81 bf 10 01 00 00 40 0c 33 a0 74 09 40 88 b5 f1 00 00 00 eb b8 48 8b 85 18 01 00 00 <48> 8b 40 20 0f b7 50 24 f0 01 50 20 eb a3 0f 1f 40 00 90 90 90 90
>     [   20.936623][   T13] RSP: 0018:ffffc9000006f248 EFLAGS: 00010246
>     [   20.937543][   T13] RAX: 0000000000000000 RBX: ffff888005a7f080 RCX: ffffc9000006f1dc
>     [   20.938788][   T13] RDX: 0000000000000000 RSI: 000000000000000a RDI: ffff888005a7f080
>     [   20.940016][   T13] RBP: ffff888011dfc540 R08: 0000000000000000 R09: 0000000000000001
>     [   20.941227][   T13] R10: ffffffff82e508e0 R11: 0000000000000005 R12: ffff88800ddfbe58
>     [   20.942375][   T13] R13: ffff888005a7f080 R14: ffff888005a7f158 R15: ffff888005a7f158
>     [   20.943531][   T13] FS:  0000000000000000(0000) GS:ffff88803ea80000(0000) knlGS:0000000000000000
>     [   20.944838][   T13] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>     [   20.945811][   T13] CR2: 0000000000000020 CR3: 0000000002e22006 CR4: 0000000000370ef0
>     [   20.946984][   T13] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>     [   20.948150][   T13] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>     [   20.949327][   T13] Call Trace:
>     [   20.949949][   T13]  <TASK>
>     [   20.950374][   T13]  ? __die_body.cold+0x19/0x26
>     [   20.951066][   T13]  ? page_fault_oops+0x13e/0x2b0
>     [   20.951766][   T13]  ? _printk+0x58/0x73
>     [   20.952358][   T13]  ? do_user_addr_fault+0x5f/0x750
>     [   20.953120][   T13]  ? exc_page_fault+0x76/0x240
>     [   20.953827][   T13]  ? asm_exc_page_fault+0x22/0x30
>     [   20.954606][   T13]  ? btrfs_bio_end_io+0xae/0xc0 [btrfs]
>     [   20.955616][   T13]  ? btrfs_log_dev_io_error+0x7f/0x90 [btrfs]
>     [   20.956682][   T13]  btrfs_orig_write_end_io+0x51/0x90 [btrfs]
>     [   20.957769][   T13]  dm_submit_bio+0x5c2/0xa50 [dm_mod]
>     [   20.958623][   T13]  ? find_held_lock+0x2b/0x80
>     [   20.959339][   T13]  ? blk_try_enter_queue+0x90/0x1e0
>     [   20.960228][   T13]  __submit_bio+0xe0/0x130
>     [   20.960879][   T13]  ? ktime_get+0x10a/0x160
>     [   20.961546][   T13]  ? lockdep_hardirqs_on+0x74/0x100
>     [   20.962310][   T13]  submit_bio_noacct_nocheck+0x199/0x410
>     [   20.963140][   T13]  btrfs_submit_bio+0x7d/0x150 [btrfs]
>     [   20.964089][   T13]  btrfs_submit_chunk+0x1a1/0x6d0 [btrfs]
>     [   20.965066][   T13]  ? lockdep_hardirqs_on+0x74/0x100
>     [   20.965824][   T13]  ? __folio_start_writeback+0x10/0x2c0
>     [   20.966659][   T13]  btrfs_submit_bbio+0x1c/0x40 [btrfs]
>     [   20.967617][   T13]  submit_one_bio+0x44/0x60 [btrfs]
>     [   20.968536][   T13]  submit_extent_folio+0x13f/0x330 [btrfs]
>     [   20.969552][   T13]  ? btrfs_set_range_writeback+0xa3/0xd0 [btrfs]
>     [   20.970625][   T13]  extent_writepage_io+0x18b/0x360 [btrfs]
>     [   20.971632][   T13]  extent_write_locked_range+0x17c/0x340 [btrfs]
>     [   20.972702][   T13]  ? __pfx_end_bbio_data_write+0x10/0x10 [btrfs]
>     [   20.973857][   T13]  run_delalloc_cow+0x71/0xd0 [btrfs]
>     [   20.974841][   T13]  btrfs_run_delalloc_range+0x176/0x500 [btrfs]
>     [   20.975870][   T13]  ? find_lock_delalloc_range+0x119/0x260 [btrfs]
>     [   20.976911][   T13]  writepage_delalloc+0x2ab/0x480 [btrfs]
>     [   20.977792][   T13]  extent_write_cache_pages+0x236/0x7d0 [btrfs]
>     [   20.978728][   T13]  btrfs_writepages+0x72/0x130 [btrfs]
>     [   20.979531][   T13]  do_writepages+0xd4/0x240
>     [   20.980111][   T13]  ? find_held_lock+0x2b/0x80
>     [   20.980695][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
>     [   20.981461][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
>     [   20.982213][   T13]  __writeback_single_inode+0x5c/0x4c0
>     [   20.982859][   T13]  ? do_raw_spin_unlock+0x49/0xb0
>     [   20.983439][   T13]  writeback_sb_inodes+0x22c/0x560
>     [   20.984079][   T13]  __writeback_inodes_wb+0x4c/0xe0
>     [   20.984886][   T13]  wb_writeback+0x1d6/0x3f0
>     [   20.985536][   T13]  wb_workfn+0x334/0x520
>     [   20.986044][   T13]  process_one_work+0x1ee/0x570
>     [   20.986580][   T13]  ? lock_is_held_type+0xc6/0x130
>     [   20.987142][   T13]  worker_thread+0x1d1/0x3b0
>     [   20.987918][   T13]  ? __pfx_worker_thread+0x10/0x10
>     [   20.988690][   T13]  kthread+0xee/0x120
>     [   20.989180][   T13]  ? __pfx_kthread+0x10/0x10
>     [   20.989915][   T13]  ret_from_fork+0x30/0x50
>     [   20.990615][   T13]  ? __pfx_kthread+0x10/0x10
>     [   20.991336][   T13]  ret_from_fork_asm+0x1a/0x30
>     [   20.992106][   T13]  </TASK>
>     [   20.992482][   T13] Modules linked in: dm_mod btrfs blake2b_generic xor raid6_pq rapl
>     [   20.993406][   T13] CR2: 0000000000000020
>     [   20.993884][   T13] ---[ end trace 0000000000000000 ]---
>     [   20.993954][ T1415] BUG: kernel NULL pointer dereference, address: 0000000000000020
> 
> * Case 2. Earlier completion of orig_bbio for mirrored btrfs_bios
> 
> btrfs_bbio_propagate_error() assumes the end_io function for orig_bbio is
> called last among split bios. In that case, btrfs_orig_write_end_io() sets
> the bio->bi_status to BLK_STS_IOERR by seeing the bioc->error [1].
> Otherwise, the increased orig_bio's bioc->error is not checked by anyone
> and return BLK_STS_OK to the upper layer.
> 
> [1] Actually, this is not true. Because we only increases orig_bioc->errors
> by max_errors, the condition "atomic_read(&bioc->error) > bioc->max_errors"
> is still not met if only one split btrfs_bio fails.
> 
> * Case 3. Later completion of orig_bbio for un-mirrored btrfs_bios
> 
> In contrast to the above case, btrfs_bbio_propagate_error() is not working
> well if un-mirrored orig_bbio is completed last. It sets
> orig_bbio->bio.bi_status to the btrfs_bio's error. But, that is easily
> over-written by orig_bbio's completion status. If the status is BLK_STS_OK,
> the upper layer would not know the failure.
> 
> * Solution
> 
> Considering the above cases, we can only save the error status in the
> orig_bbio itself as it is always available. Also, the saved error status
> should be propagated when all the split btrfs_bios are finished (i.e,
> bbio->pending_ios == 0).
> 
> This commit introduces "status" to btrfs_bbio and uses the last saved error
> status for bbio->bio.bi_status.
> 
> With this commit, btrfs/146 on zoned devices does not hit the NULL pointer
> dereference.
> 
> Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios")
> CC: stable@vger.kernel.org # 6.6+
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>  fs/btrfs/bio.c | 33 +++++++++------------------------
>  fs/btrfs/bio.h |  3 +++
>  2 files changed, 12 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
> index 056f8a171bba..a43d88bdcae7 100644
> --- a/fs/btrfs/bio.c
> +++ b/fs/btrfs/bio.c
> @@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
>  	bbio->end_io = end_io;
>  	bbio->private = private;
>  	atomic_set(&bbio->pending_ios, 1);
> +	atomic_set(&bbio->status, BLK_STS_OK);
>  }
>  
>  /*
> @@ -120,41 +121,25 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
>  	}
>  }
>  
> -static void btrfs_orig_write_end_io(struct bio *bio);
> -
> -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
> -				       struct btrfs_bio *orig_bbio)
> -{
> -	/*
> -	 * For writes we tolerate nr_mirrors - 1 write failures, so we can't
> -	 * just blindly propagate a write failure here.  Instead increment the
> -	 * error count in the original I/O context so that it is guaranteed to
> -	 * be larger than the error tolerance.
> -	 */
> -	if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
> -		struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
> -		struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
> -
> -		atomic_add(orig_bioc->max_errors, &orig_bioc->error);
> -	} else {
> -		orig_bbio->bio.bi_status = bbio->bio.bi_status;
> -	}
> -}
> -
>  void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
>  {
>  	bbio->bio.bi_status = status;
>  	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
>  		struct btrfs_bio *orig_bbio = bbio->private;
>  
> -		if (bbio->bio.bi_status)
> -			btrfs_bbio_propagate_error(bbio, orig_bbio);
> +		/* Save the last error. */
> +		if (bbio->bio.bi_status != BLK_STS_OK)
> +			atomic_set(&orig_bbio->status, bbio->bio.bi_status);
>  		btrfs_cleanup_bio(bbio);
>  		bbio = orig_bbio;
>  	}
>  
> -	if (atomic_dec_and_test(&bbio->pending_ios))
> +	if (atomic_dec_and_test(&bbio->pending_ios)) {
> +		/* Load split bio's error which might be set above. */
> +		if (status == BLK_STS_OK)
> +			bbio->bio.bi_status = atomic_read(&bbio->status);
>  		__btrfs_bio_end_io(bbio);
> +	}
>  }
>  
>  static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
> diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
> index e48612340745..b8f7f6071bc2 100644
> --- a/fs/btrfs/bio.h
> +++ b/fs/btrfs/bio.h
> @@ -79,6 +79,9 @@ struct btrfs_bio {
>  	/* File system that this I/O operates on. */
>  	struct btrfs_fs_info *fs_info;
>  
> +	/* Set the error status of split bio. */
> +	atomic_t status;

To repeat my comments from slack here. This should not be atomic when
it's using only set and read, a plain int or blk_sts_t.

The logic of storing the last error in btrfs_bio makes sense, I don't
see other ways to do it. If there are multiple errors we can store the
first one or the last one, we'd always lose some information. When it's
the first one it could be set by cmpxchg.
Qu Wenruo Oct. 9, 2024, 9:38 p.m. UTC | #2
在 2024/10/10 02:27, Naohiro Aota 写道:
> The purpose of btrfs_bbio_propagate_error() shall be propagating an error
> of split bio to its original btrfs_bio, and tell the error to the upper
> layer. However, it's not working well on some cases.
>
> * Case 1. Immediate (or quick) end_bio with an error
>
> When btrfs sends btrfs_bio to mirrored devices, btrfs calls
> btrfs_bio_end_io() when all the mirroring bios are completed. If that
> btrfs_bio was split, it is from btrfs_clone_bioset and its end_io function
> is btrfs_orig_write_end_io. For this case, btrfs_bbio_propagate_error()
> accesses the orig_bbio's bio context to increase the error count.
>
> That works well in most cases. However, if the end_io is called enough
> fast, orig_bbio's bio context may not be properly set at that time. Since
> the bio context is set when the orig_bbio (the last btrfs_bio) is sent to
> devices, that might be too late for earlier split btrfs_bio's completion.
> That will result in NULL pointer dereference.

Mind to share more info on how the NULL pointer dereference happened?

btrfs_split_bio() should always initialize the private pointer of the
new bio to point to the original one.

Thus I didn't see an immediate problem with this.


>
> That bug is easily reproducible by running btrfs/146 on zoned devices and
> it shows the following trace.

Furthermore, IIRC zoned device only supports SINGLE/DUP/RAID1 for data
without RST.

Then there should be split happening, but only mirrored writes.
Thus it looks like the description doesn't really match the symptom.


>
>      [   20.923980][   T13] BUG: kernel NULL pointer dereference, address: 0000000000000020
>      [   20.925234][   T13] #PF: supervisor read access in kernel mode
>      [   20.926122][   T13] #PF: error_code(0x0000) - not-present page
>      [   20.927118][   T13] PGD 0 P4D 0
>      [   20.927607][   T13] Oops: Oops: 0000 [#1] PREEMPT SMP PTI
>      [   20.928424][   T13] CPU: 1 UID: 0 PID: 13 Comm: kworker/u32:1 Not tainted 6.11.0-rc7-BTRFS-ZNS+ #474
>      [   20.929740][   T13] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
>      [   20.930697][   T13] Workqueue: writeback wb_workfn (flush-btrfs-5)
>      [   20.931643][   T13] RIP: 0010:btrfs_bio_end_io+0xae/0xc0 [btrfs]
>      [   20.932573][ T1415] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0
>      [   20.932871][   T13] Code: ba e1 48 8b 7b 10 e8 f1 f5 f6 ff eb da 48 81 bf 10 01 00 00 40 0c 33 a0 74 09 40 88 b5 f1 00 00 00 eb b8 48 8b 85 18 01 00 00 <48> 8b 40 20 0f b7 50 24 f0 01 50 20 eb a3 0f 1f 40 00 90 90 90 90
>      [   20.936623][   T13] RSP: 0018:ffffc9000006f248 EFLAGS: 00010246
>      [   20.937543][   T13] RAX: 0000000000000000 RBX: ffff888005a7f080 RCX: ffffc9000006f1dc
>      [   20.938788][   T13] RDX: 0000000000000000 RSI: 000000000000000a RDI: ffff888005a7f080
>      [   20.940016][   T13] RBP: ffff888011dfc540 R08: 0000000000000000 R09: 0000000000000001
>      [   20.941227][   T13] R10: ffffffff82e508e0 R11: 0000000000000005 R12: ffff88800ddfbe58
>      [   20.942375][   T13] R13: ffff888005a7f080 R14: ffff888005a7f158 R15: ffff888005a7f158
>      [   20.943531][   T13] FS:  0000000000000000(0000) GS:ffff88803ea80000(0000) knlGS:0000000000000000
>      [   20.944838][   T13] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>      [   20.945811][   T13] CR2: 0000000000000020 CR3: 0000000002e22006 CR4: 0000000000370ef0
>      [   20.946984][   T13] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>      [   20.948150][   T13] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>      [   20.949327][   T13] Call Trace:
>      [   20.949949][   T13]  <TASK>
>      [   20.950374][   T13]  ? __die_body.cold+0x19/0x26
>      [   20.951066][   T13]  ? page_fault_oops+0x13e/0x2b0
>      [   20.951766][   T13]  ? _printk+0x58/0x73
>      [   20.952358][   T13]  ? do_user_addr_fault+0x5f/0x750
>      [   20.953120][   T13]  ? exc_page_fault+0x76/0x240
>      [   20.953827][   T13]  ? asm_exc_page_fault+0x22/0x30
>      [   20.954606][   T13]  ? btrfs_bio_end_io+0xae/0xc0 [btrfs]
>      [   20.955616][   T13]  ? btrfs_log_dev_io_error+0x7f/0x90 [btrfs]
>      [   20.956682][   T13]  btrfs_orig_write_end_io+0x51/0x90 [btrfs]
>      [   20.957769][   T13]  dm_submit_bio+0x5c2/0xa50 [dm_mod]
>      [   20.958623][   T13]  ? find_held_lock+0x2b/0x80
>      [   20.959339][   T13]  ? blk_try_enter_queue+0x90/0x1e0
>      [   20.960228][   T13]  __submit_bio+0xe0/0x130
>      [   20.960879][   T13]  ? ktime_get+0x10a/0x160
>      [   20.961546][   T13]  ? lockdep_hardirqs_on+0x74/0x100
>      [   20.962310][   T13]  submit_bio_noacct_nocheck+0x199/0x410
>      [   20.963140][   T13]  btrfs_submit_bio+0x7d/0x150 [btrfs]
>      [   20.964089][   T13]  btrfs_submit_chunk+0x1a1/0x6d0 [btrfs]
>      [   20.965066][   T13]  ? lockdep_hardirqs_on+0x74/0x100
>      [   20.965824][   T13]  ? __folio_start_writeback+0x10/0x2c0
>      [   20.966659][   T13]  btrfs_submit_bbio+0x1c/0x40 [btrfs]
>      [   20.967617][   T13]  submit_one_bio+0x44/0x60 [btrfs]
>      [   20.968536][   T13]  submit_extent_folio+0x13f/0x330 [btrfs]
>      [   20.969552][   T13]  ? btrfs_set_range_writeback+0xa3/0xd0 [btrfs]
>      [   20.970625][   T13]  extent_writepage_io+0x18b/0x360 [btrfs]
>      [   20.971632][   T13]  extent_write_locked_range+0x17c/0x340 [btrfs]
>      [   20.972702][   T13]  ? __pfx_end_bbio_data_write+0x10/0x10 [btrfs]
>      [   20.973857][   T13]  run_delalloc_cow+0x71/0xd0 [btrfs]
>      [   20.974841][   T13]  btrfs_run_delalloc_range+0x176/0x500 [btrfs]
>      [   20.975870][   T13]  ? find_lock_delalloc_range+0x119/0x260 [btrfs]
>      [   20.976911][   T13]  writepage_delalloc+0x2ab/0x480 [btrfs]
>      [   20.977792][   T13]  extent_write_cache_pages+0x236/0x7d0 [btrfs]
>      [   20.978728][   T13]  btrfs_writepages+0x72/0x130 [btrfs]
>      [   20.979531][   T13]  do_writepages+0xd4/0x240
>      [   20.980111][   T13]  ? find_held_lock+0x2b/0x80
>      [   20.980695][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
>      [   20.981461][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
>      [   20.982213][   T13]  __writeback_single_inode+0x5c/0x4c0
>      [   20.982859][   T13]  ? do_raw_spin_unlock+0x49/0xb0
>      [   20.983439][   T13]  writeback_sb_inodes+0x22c/0x560
>      [   20.984079][   T13]  __writeback_inodes_wb+0x4c/0xe0
>      [   20.984886][   T13]  wb_writeback+0x1d6/0x3f0
>      [   20.985536][   T13]  wb_workfn+0x334/0x520
>      [   20.986044][   T13]  process_one_work+0x1ee/0x570
>      [   20.986580][   T13]  ? lock_is_held_type+0xc6/0x130
>      [   20.987142][   T13]  worker_thread+0x1d1/0x3b0
>      [   20.987918][   T13]  ? __pfx_worker_thread+0x10/0x10
>      [   20.988690][   T13]  kthread+0xee/0x120
>      [   20.989180][   T13]  ? __pfx_kthread+0x10/0x10
>      [   20.989915][   T13]  ret_from_fork+0x30/0x50
>      [   20.990615][   T13]  ? __pfx_kthread+0x10/0x10
>      [   20.991336][   T13]  ret_from_fork_asm+0x1a/0x30
>      [   20.992106][   T13]  </TASK>
>      [   20.992482][   T13] Modules linked in: dm_mod btrfs blake2b_generic xor raid6_pq rapl
>      [   20.993406][   T13] CR2: 0000000000000020
>      [   20.993884][   T13] ---[ end trace 0000000000000000 ]---
>      [   20.993954][ T1415] BUG: kernel NULL pointer dereference, address: 0000000000000020
>
> * Case 2. Earlier completion of orig_bbio for mirrored btrfs_bios
>
> btrfs_bbio_propagate_error() assumes the end_io function for orig_bbio is
> called last among split bios. In that case, btrfs_orig_write_end_io() sets
> the bio->bi_status to BLK_STS_IOERR by seeing the bioc->error [1].
> Otherwise, the increased orig_bio's bioc->error is not checked by anyone
> and return BLK_STS_OK to the upper layer.

That doesn't sound correct to me.

The original bio always got its bio->__bi_remaining increased when it
get cloned.

And __bi_remaining is only decreased when the cloned or the original bio
get its endio function called (bio_endio()).

For cloned bios, it's mostly the same chained bio behavior, with extra
btrfs write tolerance added.

So the explanation doesn't look correct to me.

>
> [1] Actually, this is not true. Because we only increases orig_bioc->errors
> by max_errors, the condition "atomic_read(&bioc->error) > bioc->max_errors"
> is still not met if only one split btrfs_bio fails.
>
> * Case 3. Later completion of orig_bbio for un-mirrored btrfs_bios
>
> In contrast to the above case, btrfs_bbio_propagate_error() is not working
> well if un-mirrored orig_bbio is completed last. It sets
> orig_bbio->bio.bi_status to the btrfs_bio's error. But, that is easily
> over-written by orig_bbio's completion status. If the status is BLK_STS_OK,
> the upper layer would not know the failure.
>
> * Solution
>
> Considering the above cases, we can only save the error status in the
> orig_bbio itself as it is always available. Also, the saved error status
> should be propagated when all the split btrfs_bios are finished (i.e,
> bbio->pending_ios == 0).

This looks like it will not handle the write error tolerance at all.

If there is really some timing related problem, I'd recommend to queue
all the split bios into a bio_list, and submit them all when all splits
is done.

Otherwise we will not tolerate any write failures.

>
> This commit introduces "status" to btrfs_bbio and uses the last saved error
> status for bbio->bio.bi_status.
>
> With this commit, btrfs/146 on zoned devices does not hit the NULL pointer
> dereference.
>
> Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios")
> CC: stable@vger.kernel.org # 6.6+
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>   fs/btrfs/bio.c | 33 +++++++++------------------------
>   fs/btrfs/bio.h |  3 +++
>   2 files changed, 12 insertions(+), 24 deletions(-)
>
> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
> index 056f8a171bba..a43d88bdcae7 100644
> --- a/fs/btrfs/bio.c
> +++ b/fs/btrfs/bio.c
> @@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
>   	bbio->end_io = end_io;
>   	bbio->private = private;
>   	atomic_set(&bbio->pending_ios, 1);
> +	atomic_set(&bbio->status, BLK_STS_OK);
>   }
>
>   /*
> @@ -120,41 +121,25 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
>   	}
>   }
>
> -static void btrfs_orig_write_end_io(struct bio *bio);
> -
> -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
> -				       struct btrfs_bio *orig_bbio)
> -{
> -	/*
> -	 * For writes we tolerate nr_mirrors - 1 write failures, so we can't
> -	 * just blindly propagate a write failure here.  Instead increment the
> -	 * error count in the original I/O context so that it is guaranteed to
> -	 * be larger than the error tolerance.
> -	 */
> -	if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
> -		struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
> -		struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
> -
> -		atomic_add(orig_bioc->max_errors, &orig_bioc->error);
> -	} else {
> -		orig_bbio->bio.bi_status = bbio->bio.bi_status;
> -	}
> -}
> -
>   void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
>   {
>   	bbio->bio.bi_status = status;
>   	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
>   		struct btrfs_bio *orig_bbio = bbio->private;
>
> -		if (bbio->bio.bi_status)
> -			btrfs_bbio_propagate_error(bbio, orig_bbio);
> +		/* Save the last error. */
> +		if (bbio->bio.bi_status != BLK_STS_OK)
> +			atomic_set(&orig_bbio->status, bbio->bio.bi_status);
>   		btrfs_cleanup_bio(bbio);
>   		bbio = orig_bbio;
>   	}
>
> -	if (atomic_dec_and_test(&bbio->pending_ios))
> +	if (atomic_dec_and_test(&bbio->pending_ios)) {
> +		/* Load split bio's error which might be set above. */
> +		if (status == BLK_STS_OK)
> +			bbio->bio.bi_status = atomic_read(&bbio->status);
>   		__btrfs_bio_end_io(bbio);
> +	}
>   }
>
>   static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
> diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
> index e48612340745..b8f7f6071bc2 100644
> --- a/fs/btrfs/bio.h
> +++ b/fs/btrfs/bio.h
> @@ -79,6 +79,9 @@ struct btrfs_bio {
>   	/* File system that this I/O operates on. */
>   	struct btrfs_fs_info *fs_info;
>
> +	/* Set the error status of split bio. */
> +	atomic_t status;
> +

The same as David, please do not abuse atomic_t for this purpose.

Thanks,
Qu

>   	/*
>   	 * This member must come last, bio_alloc_bioset will allocate enough
>   	 * bytes for entire btrfs_bio but relies on bio being last.
Qu Wenruo Oct. 9, 2024, 9:40 p.m. UTC | #3
在 2024/10/10 08:08, Qu Wenruo 写道:
> 
> 
> 在 2024/10/10 02:27, Naohiro Aota 写道:
>> The purpose of btrfs_bbio_propagate_error() shall be propagating an error
>> of split bio to its original btrfs_bio, and tell the error to the upper
>> layer. However, it's not working well on some cases.
>>
>> * Case 1. Immediate (or quick) end_bio with an error
>>
>> When btrfs sends btrfs_bio to mirrored devices, btrfs calls
>> btrfs_bio_end_io() when all the mirroring bios are completed. If that
>> btrfs_bio was split, it is from btrfs_clone_bioset and its end_io 
>> function
>> is btrfs_orig_write_end_io. For this case, btrfs_bbio_propagate_error()
>> accesses the orig_bbio's bio context to increase the error count.
>>
>> That works well in most cases. However, if the end_io is called enough
>> fast, orig_bbio's bio context may not be properly set at that time. Since
>> the bio context is set when the orig_bbio (the last btrfs_bio) is sent to
>> devices, that might be too late for earlier split btrfs_bio's completion.
>> That will result in NULL pointer dereference.
> 
> Mind to share more info on how the NULL pointer dereference happened?
> 
> btrfs_split_bio() should always initialize the private pointer of the
> new bio to point to the original one.
> 
> Thus I didn't see an immediate problem with this.
> 
> 
>>
>> That bug is easily reproducible by running btrfs/146 on zoned devices and
>> it shows the following trace.
> 
> Furthermore, IIRC zoned device only supports SINGLE/DUP/RAID1 for data
> without RST.
> 
> Then there should be split happening, but only mirrored writes.

s/should be/should be no/

> Thus it looks like the description doesn't really match the symptom.
> 
> 
>>
>>      [   20.923980][   T13] BUG: kernel NULL pointer dereference, 
>> address: 0000000000000020
>>      [   20.925234][   T13] #PF: supervisor read access in kernel mode
>>      [   20.926122][   T13] #PF: error_code(0x0000) - not-present page
>>      [   20.927118][   T13] PGD 0 P4D 0
>>      [   20.927607][   T13] Oops: Oops: 0000 [#1] PREEMPT SMP PTI
>>      [   20.928424][   T13] CPU: 1 UID: 0 PID: 13 Comm: kworker/u32:1 
>> Not tainted 6.11.0-rc7-BTRFS-ZNS+ #474
>>      [   20.929740][   T13] Hardware name: Bochs Bochs, BIOS Bochs 
>> 01/01/2011
>>      [   20.930697][   T13] Workqueue: writeback wb_workfn (flush- 
>> btrfs-5)
>>      [   20.931643][   T13] RIP: 0010:btrfs_bio_end_io+0xae/0xc0 [btrfs]
>>      [   20.932573][ T1415] BTRFS error (device dm-0): bdev /dev/ 
>> mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0
>>      [   20.932871][   T13] Code: ba e1 48 8b 7b 10 e8 f1 f5 f6 ff eb 
>> da 48 81 bf 10 01 00 00 40 0c 33 a0 74 09 40 88 b5 f1 00 00 00 eb b8 
>> 48 8b 85 18 01 00 00 <48> 8b 40 20 0f b7 50 24 f0 01 50 20 eb a3 0f 1f 
>> 40 00 90 90 90 90
>>      [   20.936623][   T13] RSP: 0018:ffffc9000006f248 EFLAGS: 00010246
>>      [   20.937543][   T13] RAX: 0000000000000000 RBX: 
>> ffff888005a7f080 RCX: ffffc9000006f1dc
>>      [   20.938788][   T13] RDX: 0000000000000000 RSI: 
>> 000000000000000a RDI: ffff888005a7f080
>>      [   20.940016][   T13] RBP: ffff888011dfc540 R08: 
>> 0000000000000000 R09: 0000000000000001
>>      [   20.941227][   T13] R10: ffffffff82e508e0 R11: 
>> 0000000000000005 R12: ffff88800ddfbe58
>>      [   20.942375][   T13] R13: ffff888005a7f080 R14: 
>> ffff888005a7f158 R15: ffff888005a7f158
>>      [   20.943531][   T13] FS:  0000000000000000(0000) 
>> GS:ffff88803ea80000(0000) knlGS:0000000000000000
>>      [   20.944838][   T13] CS:  0010 DS: 0000 ES: 0000 CR0: 
>> 0000000080050033
>>      [   20.945811][   T13] CR2: 0000000000000020 CR3: 
>> 0000000002e22006 CR4: 0000000000370ef0
>>      [   20.946984][   T13] DR0: 0000000000000000 DR1: 
>> 0000000000000000 DR2: 0000000000000000
>>      [   20.948150][   T13] DR3: 0000000000000000 DR6: 
>> 00000000fffe0ff0 DR7: 0000000000000400
>>      [   20.949327][   T13] Call Trace:
>>      [   20.949949][   T13]  <TASK>
>>      [   20.950374][   T13]  ? __die_body.cold+0x19/0x26
>>      [   20.951066][   T13]  ? page_fault_oops+0x13e/0x2b0
>>      [   20.951766][   T13]  ? _printk+0x58/0x73
>>      [   20.952358][   T13]  ? do_user_addr_fault+0x5f/0x750
>>      [   20.953120][   T13]  ? exc_page_fault+0x76/0x240
>>      [   20.953827][   T13]  ? asm_exc_page_fault+0x22/0x30
>>      [   20.954606][   T13]  ? btrfs_bio_end_io+0xae/0xc0 [btrfs]
>>      [   20.955616][   T13]  ? btrfs_log_dev_io_error+0x7f/0x90 [btrfs]
>>      [   20.956682][   T13]  btrfs_orig_write_end_io+0x51/0x90 [btrfs]
>>      [   20.957769][   T13]  dm_submit_bio+0x5c2/0xa50 [dm_mod]
>>      [   20.958623][   T13]  ? find_held_lock+0x2b/0x80
>>      [   20.959339][   T13]  ? blk_try_enter_queue+0x90/0x1e0
>>      [   20.960228][   T13]  __submit_bio+0xe0/0x130
>>      [   20.960879][   T13]  ? ktime_get+0x10a/0x160
>>      [   20.961546][   T13]  ? lockdep_hardirqs_on+0x74/0x100
>>      [   20.962310][   T13]  submit_bio_noacct_nocheck+0x199/0x410
>>      [   20.963140][   T13]  btrfs_submit_bio+0x7d/0x150 [btrfs]
>>      [   20.964089][   T13]  btrfs_submit_chunk+0x1a1/0x6d0 [btrfs]
>>      [   20.965066][   T13]  ? lockdep_hardirqs_on+0x74/0x100
>>      [   20.965824][   T13]  ? __folio_start_writeback+0x10/0x2c0
>>      [   20.966659][   T13]  btrfs_submit_bbio+0x1c/0x40 [btrfs]
>>      [   20.967617][   T13]  submit_one_bio+0x44/0x60 [btrfs]
>>      [   20.968536][   T13]  submit_extent_folio+0x13f/0x330 [btrfs]
>>      [   20.969552][   T13]  ? btrfs_set_range_writeback+0xa3/0xd0 
>> [btrfs]
>>      [   20.970625][   T13]  extent_writepage_io+0x18b/0x360 [btrfs]
>>      [   20.971632][   T13]  extent_write_locked_range+0x17c/0x340 
>> [btrfs]
>>      [   20.972702][   T13]  ? __pfx_end_bbio_data_write+0x10/0x10 
>> [btrfs]
>>      [   20.973857][   T13]  run_delalloc_cow+0x71/0xd0 [btrfs]
>>      [   20.974841][   T13]  btrfs_run_delalloc_range+0x176/0x500 [btrfs]
>>      [   20.975870][   T13]  ? find_lock_delalloc_range+0x119/0x260 
>> [btrfs]
>>      [   20.976911][   T13]  writepage_delalloc+0x2ab/0x480 [btrfs]
>>      [   20.977792][   T13]  extent_write_cache_pages+0x236/0x7d0 [btrfs]
>>      [   20.978728][   T13]  btrfs_writepages+0x72/0x130 [btrfs]
>>      [   20.979531][   T13]  do_writepages+0xd4/0x240
>>      [   20.980111][   T13]  ? find_held_lock+0x2b/0x80
>>      [   20.980695][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
>>      [   20.981461][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
>>      [   20.982213][   T13]  __writeback_single_inode+0x5c/0x4c0
>>      [   20.982859][   T13]  ? do_raw_spin_unlock+0x49/0xb0
>>      [   20.983439][   T13]  writeback_sb_inodes+0x22c/0x560
>>      [   20.984079][   T13]  __writeback_inodes_wb+0x4c/0xe0
>>      [   20.984886][   T13]  wb_writeback+0x1d6/0x3f0
>>      [   20.985536][   T13]  wb_workfn+0x334/0x520
>>      [   20.986044][   T13]  process_one_work+0x1ee/0x570
>>      [   20.986580][   T13]  ? lock_is_held_type+0xc6/0x130
>>      [   20.987142][   T13]  worker_thread+0x1d1/0x3b0
>>      [   20.987918][   T13]  ? __pfx_worker_thread+0x10/0x10
>>      [   20.988690][   T13]  kthread+0xee/0x120
>>      [   20.989180][   T13]  ? __pfx_kthread+0x10/0x10
>>      [   20.989915][   T13]  ret_from_fork+0x30/0x50
>>      [   20.990615][   T13]  ? __pfx_kthread+0x10/0x10
>>      [   20.991336][   T13]  ret_from_fork_asm+0x1a/0x30
>>      [   20.992106][   T13]  </TASK>
>>      [   20.992482][   T13] Modules linked in: dm_mod btrfs 
>> blake2b_generic xor raid6_pq rapl
>>      [   20.993406][   T13] CR2: 0000000000000020
>>      [   20.993884][   T13] ---[ end trace 0000000000000000 ]---
>>      [   20.993954][ T1415] BUG: kernel NULL pointer dereference, 
>> address: 0000000000000020
>>
>> * Case 2. Earlier completion of orig_bbio for mirrored btrfs_bios
>>
>> btrfs_bbio_propagate_error() assumes the end_io function for orig_bbio is
>> called last among split bios. In that case, btrfs_orig_write_end_io() 
>> sets
>> the bio->bi_status to BLK_STS_IOERR by seeing the bioc->error [1].
>> Otherwise, the increased orig_bio's bioc->error is not checked by anyone
>> and return BLK_STS_OK to the upper layer.
> 
> That doesn't sound correct to me.
> 
> The original bio always got its bio->__bi_remaining increased when it
> get cloned.
> 
> And __bi_remaining is only decreased when the cloned or the original bio
> get its endio function called (bio_endio()).
> 
> For cloned bios, it's mostly the same chained bio behavior, with extra
> btrfs write tolerance added.
> 
> So the explanation doesn't look correct to me.
> 
>>
>> [1] Actually, this is not true. Because we only increases orig_bioc- 
>> >errors
>> by max_errors, the condition "atomic_read(&bioc->error) > bioc- 
>> >max_errors"
>> is still not met if only one split btrfs_bio fails.
>>
>> * Case 3. Later completion of orig_bbio for un-mirrored btrfs_bios
>>
>> In contrast to the above case, btrfs_bbio_propagate_error() is not 
>> working
>> well if un-mirrored orig_bbio is completed last. It sets
>> orig_bbio->bio.bi_status to the btrfs_bio's error. But, that is easily
>> over-written by orig_bbio's completion status. If the status is 
>> BLK_STS_OK,
>> the upper layer would not know the failure.
>>
>> * Solution
>>
>> Considering the above cases, we can only save the error status in the
>> orig_bbio itself as it is always available. Also, the saved error status
>> should be propagated when all the split btrfs_bios are finished (i.e,
>> bbio->pending_ios == 0).
> 
> This looks like it will not handle the write error tolerance at all.
> 
> If there is really some timing related problem, I'd recommend to queue
> all the split bios into a bio_list, and submit them all when all splits
> is done.
> 
> Otherwise we will not tolerate any write failures.
> 
>>
>> This commit introduces "status" to btrfs_bbio and uses the last saved 
>> error
>> status for bbio->bio.bi_status.
>>
>> With this commit, btrfs/146 on zoned devices does not hit the NULL 
>> pointer
>> dereference.
>>
>> Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios")
>> CC: stable@vger.kernel.org # 6.6+
>> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
>> ---
>>   fs/btrfs/bio.c | 33 +++++++++------------------------
>>   fs/btrfs/bio.h |  3 +++
>>   2 files changed, 12 insertions(+), 24 deletions(-)
>>
>> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
>> index 056f8a171bba..a43d88bdcae7 100644
>> --- a/fs/btrfs/bio.c
>> +++ b/fs/btrfs/bio.c
>> @@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct 
>> btrfs_fs_info *fs_info,
>>       bbio->end_io = end_io;
>>       bbio->private = private;
>>       atomic_set(&bbio->pending_ios, 1);
>> +    atomic_set(&bbio->status, BLK_STS_OK);
>>   }
>>
>>   /*
>> @@ -120,41 +121,25 @@ static void __btrfs_bio_end_io(struct btrfs_bio 
>> *bbio)
>>       }
>>   }
>>
>> -static void btrfs_orig_write_end_io(struct bio *bio);
>> -
>> -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
>> -                       struct btrfs_bio *orig_bbio)
>> -{
>> -    /*
>> -     * For writes we tolerate nr_mirrors - 1 write failures, so we can't
>> -     * just blindly propagate a write failure here.  Instead 
>> increment the
>> -     * error count in the original I/O context so that it is 
>> guaranteed to
>> -     * be larger than the error tolerance.
>> -     */
>> -    if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
>> -        struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
>> -        struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
>> -
>> -        atomic_add(orig_bioc->max_errors, &orig_bioc->error);
>> -    } else {
>> -        orig_bbio->bio.bi_status = bbio->bio.bi_status;
>> -    }
>> -}
>> -
>>   void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
>>   {
>>       bbio->bio.bi_status = status;
>>       if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
>>           struct btrfs_bio *orig_bbio = bbio->private;
>>
>> -        if (bbio->bio.bi_status)
>> -            btrfs_bbio_propagate_error(bbio, orig_bbio);
>> +        /* Save the last error. */
>> +        if (bbio->bio.bi_status != BLK_STS_OK)
>> +            atomic_set(&orig_bbio->status, bbio->bio.bi_status);
>>           btrfs_cleanup_bio(bbio);
>>           bbio = orig_bbio;
>>       }
>>
>> -    if (atomic_dec_and_test(&bbio->pending_ios))
>> +    if (atomic_dec_and_test(&bbio->pending_ios)) {
>> +        /* Load split bio's error which might be set above. */
>> +        if (status == BLK_STS_OK)
>> +            bbio->bio.bi_status = atomic_read(&bbio->status);
>>           __btrfs_bio_end_io(bbio);
>> +    }
>>   }
>>
>>   static int next_repair_mirror(struct btrfs_failed_bio *fbio, int 
>> cur_mirror)
>> diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
>> index e48612340745..b8f7f6071bc2 100644
>> --- a/fs/btrfs/bio.h
>> +++ b/fs/btrfs/bio.h
>> @@ -79,6 +79,9 @@ struct btrfs_bio {
>>       /* File system that this I/O operates on. */
>>       struct btrfs_fs_info *fs_info;
>>
>> +    /* Set the error status of split bio. */
>> +    atomic_t status;
>> +
> 
> The same as David, please do not abuse atomic_t for this purpose.
> 
> Thanks,
> Qu
> 
>>       /*
>>        * This member must come last, bio_alloc_bioset will allocate 
>> enough
>>        * bytes for entire btrfs_bio but relies on bio being last.
>
Qu Wenruo Oct. 9, 2024, 9:58 p.m. UTC | #4
在 2024/10/10 08:08, Qu Wenruo 写道:
>
>
> 在 2024/10/10 02:27, Naohiro Aota 写道:
>> The purpose of btrfs_bbio_propagate_error() shall be propagating an error
>> of split bio to its original btrfs_bio, and tell the error to the upper
>> layer. However, it's not working well on some cases.
>>
>> * Case 1. Immediate (or quick) end_bio with an error
>>
>> When btrfs sends btrfs_bio to mirrored devices, btrfs calls
>> btrfs_bio_end_io() when all the mirroring bios are completed. If that
>> btrfs_bio was split, it is from btrfs_clone_bioset and its end_io
>> function
>> is btrfs_orig_write_end_io. For this case, btrfs_bbio_propagate_error()
>> accesses the orig_bbio's bio context to increase the error count.
>>
>> That works well in most cases. However, if the end_io is called enough
>> fast, orig_bbio's bio context may not be properly set at that time. Since
>> the bio context is set when the orig_bbio (the last btrfs_bio) is sent to
>> devices, that might be too late for earlier split btrfs_bio's completion.
>> That will result in NULL pointer dereference.
>
> Mind to share more info on how the NULL pointer dereference happened?
>
> btrfs_split_bio() should always initialize the private pointer of the
> new bio to point to the original one.
>
> Thus I didn't see an immediate problem with this.
>
>
>>
>> That bug is easily reproducible by running btrfs/146 on zoned devices and
>> it shows the following trace.
>
> Furthermore, IIRC zoned device only supports SINGLE/DUP/RAID1 for data
> without RST.
>
> Then there should be split happening, but only mirrored writes.
> Thus it looks like the description doesn't really match the symptom.
>
>
>>
>>      [   20.923980][   T13] BUG: kernel NULL pointer dereference,
>> address: 0000000000000020
>>      [   20.925234][   T13] #PF: supervisor read access in kernel mode
>>      [   20.926122][   T13] #PF: error_code(0x0000) - not-present page
>>      [   20.927118][   T13] PGD 0 P4D 0
>>      [   20.927607][   T13] Oops: Oops: 0000 [#1] PREEMPT SMP PTI
>>      [   20.928424][   T13] CPU: 1 UID: 0 PID: 13 Comm: kworker/u32:1
>> Not tainted 6.11.0-rc7-BTRFS-ZNS+ #474
>>      [   20.929740][   T13] Hardware name: Bochs Bochs, BIOS Bochs
>> 01/01/2011
>>      [   20.930697][   T13] Workqueue: writeback wb_workfn (flush-
>> btrfs-5)
>>      [   20.931643][   T13] RIP: 0010:btrfs_bio_end_io+0xae/0xc0 [btrfs]
>>      [   20.932573][ T1415] BTRFS error (device dm-0): bdev /dev/
>> mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0
>>      [   20.932871][   T13] Code: ba e1 48 8b 7b 10 e8 f1 f5 f6 ff eb
>> da 48 81 bf 10 01 00 00 40 0c 33 a0 74 09 40 88 b5 f1 00 00 00 eb b8
>> 48 8b 85 18 01 00 00 <48> 8b 40 20 0f b7 50 24 f0 01 50 20 eb a3 0f 1f
>> 40 00 90 90 90 90
>>      [   20.936623][   T13] RSP: 0018:ffffc9000006f248 EFLAGS: 00010246
>>      [   20.937543][   T13] RAX: 0000000000000000 RBX:
>> ffff888005a7f080 RCX: ffffc9000006f1dc
>>      [   20.938788][   T13] RDX: 0000000000000000 RSI:
>> 000000000000000a RDI: ffff888005a7f080
>>      [   20.940016][   T13] RBP: ffff888011dfc540 R08:
>> 0000000000000000 R09: 0000000000000001
>>      [   20.941227][   T13] R10: ffffffff82e508e0 R11:
>> 0000000000000005 R12: ffff88800ddfbe58
>>      [   20.942375][   T13] R13: ffff888005a7f080 R14:
>> ffff888005a7f158 R15: ffff888005a7f158
>>      [   20.943531][   T13] FS:  0000000000000000(0000)
>> GS:ffff88803ea80000(0000) knlGS:0000000000000000
>>      [   20.944838][   T13] CS:  0010 DS: 0000 ES: 0000 CR0:
>> 0000000080050033
>>      [   20.945811][   T13] CR2: 0000000000000020 CR3:
>> 0000000002e22006 CR4: 0000000000370ef0
>>      [   20.946984][   T13] DR0: 0000000000000000 DR1:
>> 0000000000000000 DR2: 0000000000000000
>>      [   20.948150][   T13] DR3: 0000000000000000 DR6:
>> 00000000fffe0ff0 DR7: 0000000000000400
>>      [   20.949327][   T13] Call Trace:
>>      [   20.949949][   T13]  <TASK>
>>      [   20.950374][   T13]  ? __die_body.cold+0x19/0x26
>>      [   20.951066][   T13]  ? page_fault_oops+0x13e/0x2b0
>>      [   20.951766][   T13]  ? _printk+0x58/0x73
>>      [   20.952358][   T13]  ? do_user_addr_fault+0x5f/0x750
>>      [   20.953120][   T13]  ? exc_page_fault+0x76/0x240
>>      [   20.953827][   T13]  ? asm_exc_page_fault+0x22/0x30
>>      [   20.954606][   T13]  ? btrfs_bio_end_io+0xae/0xc0 [btrfs]
>>      [   20.955616][   T13]  ? btrfs_log_dev_io_error+0x7f/0x90 [btrfs]
>>      [   20.956682][   T13]  btrfs_orig_write_end_io+0x51/0x90 [btrfs]
>>      [   20.957769][   T13]  dm_submit_bio+0x5c2/0xa50 [dm_mod]
>>      [   20.958623][   T13]  ? find_held_lock+0x2b/0x80
>>      [   20.959339][   T13]  ? blk_try_enter_queue+0x90/0x1e0
>>      [   20.960228][   T13]  __submit_bio+0xe0/0x130
>>      [   20.960879][   T13]  ? ktime_get+0x10a/0x160
>>      [   20.961546][   T13]  ? lockdep_hardirqs_on+0x74/0x100
>>      [   20.962310][   T13]  submit_bio_noacct_nocheck+0x199/0x410
>>      [   20.963140][   T13]  btrfs_submit_bio+0x7d/0x150 [btrfs]
>>      [   20.964089][   T13]  btrfs_submit_chunk+0x1a1/0x6d0 [btrfs]
>>      [   20.965066][   T13]  ? lockdep_hardirqs_on+0x74/0x100
>>      [   20.965824][   T13]  ? __folio_start_writeback+0x10/0x2c0
>>      [   20.966659][   T13]  btrfs_submit_bbio+0x1c/0x40 [btrfs]
>>      [   20.967617][   T13]  submit_one_bio+0x44/0x60 [btrfs]
>>      [   20.968536][   T13]  submit_extent_folio+0x13f/0x330 [btrfs]
>>      [   20.969552][   T13]  ? btrfs_set_range_writeback+0xa3/0xd0
>> [btrfs]
>>      [   20.970625][   T13]  extent_writepage_io+0x18b/0x360 [btrfs]
>>      [   20.971632][   T13]  extent_write_locked_range+0x17c/0x340
>> [btrfs]
>>      [   20.972702][   T13]  ? __pfx_end_bbio_data_write+0x10/0x10
>> [btrfs]
>>      [   20.973857][   T13]  run_delalloc_cow+0x71/0xd0 [btrfs]
>>      [   20.974841][   T13]  btrfs_run_delalloc_range+0x176/0x500 [btrfs]
>>      [   20.975870][   T13]  ? find_lock_delalloc_range+0x119/0x260
>> [btrfs]
>>      [   20.976911][   T13]  writepage_delalloc+0x2ab/0x480 [btrfs]
>>      [   20.977792][   T13]  extent_write_cache_pages+0x236/0x7d0 [btrfs]
>>      [   20.978728][   T13]  btrfs_writepages+0x72/0x130 [btrfs]
>>      [   20.979531][   T13]  do_writepages+0xd4/0x240
>>      [   20.980111][   T13]  ? find_held_lock+0x2b/0x80
>>      [   20.980695][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
>>      [   20.981461][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
>>      [   20.982213][   T13]  __writeback_single_inode+0x5c/0x4c0
>>      [   20.982859][   T13]  ? do_raw_spin_unlock+0x49/0xb0
>>      [   20.983439][   T13]  writeback_sb_inodes+0x22c/0x560
>>      [   20.984079][   T13]  __writeback_inodes_wb+0x4c/0xe0
>>      [   20.984886][   T13]  wb_writeback+0x1d6/0x3f0
>>      [   20.985536][   T13]  wb_workfn+0x334/0x520
>>      [   20.986044][   T13]  process_one_work+0x1ee/0x570
>>      [   20.986580][   T13]  ? lock_is_held_type+0xc6/0x130
>>      [   20.987142][   T13]  worker_thread+0x1d1/0x3b0
>>      [   20.987918][   T13]  ? __pfx_worker_thread+0x10/0x10
>>      [   20.988690][   T13]  kthread+0xee/0x120
>>      [   20.989180][   T13]  ? __pfx_kthread+0x10/0x10
>>      [   20.989915][   T13]  ret_from_fork+0x30/0x50
>>      [   20.990615][   T13]  ? __pfx_kthread+0x10/0x10
>>      [   20.991336][   T13]  ret_from_fork_asm+0x1a/0x30
>>      [   20.992106][   T13]  </TASK>
>>      [   20.992482][   T13] Modules linked in: dm_mod btrfs
>> blake2b_generic xor raid6_pq rapl
>>      [   20.993406][   T13] CR2: 0000000000000020
>>      [   20.993884][   T13] ---[ end trace 0000000000000000 ]---
>>      [   20.993954][ T1415] BUG: kernel NULL pointer dereference,
>> address: 0000000000000020
>>
>> * Case 2. Earlier completion of orig_bbio for mirrored btrfs_bios
>>
>> btrfs_bbio_propagate_error() assumes the end_io function for orig_bbio is
>> called last among split bios. In that case, btrfs_orig_write_end_io()
>> sets
>> the bio->bi_status to BLK_STS_IOERR by seeing the bioc->error [1].
>> Otherwise, the increased orig_bio's bioc->error is not checked by anyone
>> and return BLK_STS_OK to the upper layer.
>
> That doesn't sound correct to me.
>
> The original bio always got its bio->__bi_remaining increased when it
> get cloned.
>
> And __bi_remaining is only decreased when the cloned or the original bio
> get its endio function called (bio_endio()).
>
> For cloned bios, it's mostly the same chained bio behavior, with extra
> btrfs write tolerance added.

OK, I see the point. For the cloned ones we can have the following case:

The profile is DUP/RAID1.

For stripe 0, we cloned the original bio, increased
orig_bio->__bi_remaining.

Then submitted the cloned bio.

But before submitting the original one, cloned one get finished first,
it call the cloned endio function, which calls back to the endio of the
original bio.

Then the endio function decrease the __bi_remaining to 0 of the original
bio, thus it continue to call the endio of the original bio, which freed
the original bio.

If it's really the case, please add some trace output to explain the
situation better.

>
> So the explanation doesn't look correct to me.
>
>>
>> [1] Actually, this is not true. Because we only increases orig_bioc-
>> >errors
>> by max_errors, the condition "atomic_read(&bioc->error) > bioc-
>> >max_errors"
>> is still not met if only one split btrfs_bio fails.
>>
>> * Case 3. Later completion of orig_bbio for un-mirrored btrfs_bios
>>
>> In contrast to the above case, btrfs_bbio_propagate_error() is not
>> working
>> well if un-mirrored orig_bbio is completed last. It sets
>> orig_bbio->bio.bi_status to the btrfs_bio's error. But, that is easily
>> over-written by orig_bbio's completion status. If the status is
>> BLK_STS_OK,
>> the upper layer would not know the failure.
>>
>> * Solution
>>
>> Considering the above cases, we can only save the error status in the
>> orig_bbio itself as it is always available. Also, the saved error status
>> should be propagated when all the split btrfs_bios are finished (i.e,
>> bbio->pending_ios == 0).
>
> This looks like it will not handle the write error tolerance at all.
>
> If there is really some timing related problem, I'd recommend to queue
> all the split bios into a bio_list, and submit them all when all splits
> is done.

Talking about the solution, may be we can go this diff instead?

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 5d3f8bd406d9..a13f055fec4c 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -515,8 +515,10 @@ static void btrfs_submit_bio(struct bio *bio,
struct btrfs_io_context *bioc,
                 int total_devs = bioc->num_stripes;

                 bioc->orig_bio = bio;
+               bio_inc_remaining(bio);
                 for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
                         btrfs_submit_mirrored_bio(bioc, dev_nr);
+               bio_endio(bio);
         }
  }

The idea is to increase the remaining so that any early finished bio
will not trigger the endio of the original bio.

Or you can also go the old bio_list way, which alsoo should be fine.

Thanks,
Qu
>
> Otherwise we will not tolerate any write failures.
>
>>
>> This commit introduces "status" to btrfs_bbio and uses the last saved
>> error
>> status for bbio->bio.bi_status.
>>
>> With this commit, btrfs/146 on zoned devices does not hit the NULL
>> pointer
>> dereference.
>>
>> Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios")
>> CC: stable@vger.kernel.org # 6.6+
>> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
>> ---
>>   fs/btrfs/bio.c | 33 +++++++++------------------------
>>   fs/btrfs/bio.h |  3 +++
>>   2 files changed, 12 insertions(+), 24 deletions(-)
>>
>> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
>> index 056f8a171bba..a43d88bdcae7 100644
>> --- a/fs/btrfs/bio.c
>> +++ b/fs/btrfs/bio.c
>> @@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct
>> btrfs_fs_info *fs_info,
>>       bbio->end_io = end_io;
>>       bbio->private = private;
>>       atomic_set(&bbio->pending_ios, 1);
>> +    atomic_set(&bbio->status, BLK_STS_OK);
>>   }
>>
>>   /*
>> @@ -120,41 +121,25 @@ static void __btrfs_bio_end_io(struct btrfs_bio
>> *bbio)
>>       }
>>   }
>>
>> -static void btrfs_orig_write_end_io(struct bio *bio);
>> -
>> -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
>> -                       struct btrfs_bio *orig_bbio)
>> -{
>> -    /*
>> -     * For writes we tolerate nr_mirrors - 1 write failures, so we can't
>> -     * just blindly propagate a write failure here.  Instead
>> increment the
>> -     * error count in the original I/O context so that it is
>> guaranteed to
>> -     * be larger than the error tolerance.
>> -     */
>> -    if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
>> -        struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
>> -        struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
>> -
>> -        atomic_add(orig_bioc->max_errors, &orig_bioc->error);
>> -    } else {
>> -        orig_bbio->bio.bi_status = bbio->bio.bi_status;
>> -    }
>> -}
>> -
>>   void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
>>   {
>>       bbio->bio.bi_status = status;
>>       if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
>>           struct btrfs_bio *orig_bbio = bbio->private;
>>
>> -        if (bbio->bio.bi_status)
>> -            btrfs_bbio_propagate_error(bbio, orig_bbio);
>> +        /* Save the last error. */
>> +        if (bbio->bio.bi_status != BLK_STS_OK)
>> +            atomic_set(&orig_bbio->status, bbio->bio.bi_status);
>>           btrfs_cleanup_bio(bbio);
>>           bbio = orig_bbio;
>>       }
>>
>> -    if (atomic_dec_and_test(&bbio->pending_ios))
>> +    if (atomic_dec_and_test(&bbio->pending_ios)) {
>> +        /* Load split bio's error which might be set above. */
>> +        if (status == BLK_STS_OK)
>> +            bbio->bio.bi_status = atomic_read(&bbio->status);
>>           __btrfs_bio_end_io(bbio);
>> +    }
>>   }
>>
>>   static int next_repair_mirror(struct btrfs_failed_bio *fbio, int
>> cur_mirror)
>> diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
>> index e48612340745..b8f7f6071bc2 100644
>> --- a/fs/btrfs/bio.h
>> +++ b/fs/btrfs/bio.h
>> @@ -79,6 +79,9 @@ struct btrfs_bio {
>>       /* File system that this I/O operates on. */
>>       struct btrfs_fs_info *fs_info;
>>
>> +    /* Set the error status of split bio. */
>> +    atomic_t status;
>> +
>
> The same as David, please do not abuse atomic_t for this purpose.
>
> Thanks,
> Qu
>
>>       /*
>>        * This member must come last, bio_alloc_bioset will allocate
>> enough
>>        * bytes for entire btrfs_bio but relies on bio being last.
>
>
Qu Wenruo Oct. 9, 2024, 11:11 p.m. UTC | #5
在 2024/10/10 08:28, Qu Wenruo 写道:
>
>
> 在 2024/10/10 08:08, Qu Wenruo 写道:
[...]
>> And __bi_remaining is only decreased when the cloned or the original bio
>> get its endio function called (bio_endio()).
>>
>> For cloned bios, it's mostly the same chained bio behavior, with extra
>> btrfs write tolerance added.
>
> OK, I see the point. For the cloned ones we can have the following case:
>
> The profile is DUP/RAID1.
>
> For stripe 0, we cloned the original bio, increased
> orig_bio->__bi_remaining.
>
> Then submitted the cloned bio.
>
> But before submitting the original one, cloned one get finished first,
> it call the cloned endio function, which calls back to the endio of the
> original bio.
>
> Then the endio function decrease the __bi_remaining to 0 of the original
> bio, thus it continue to call the endio of the original bio, which freed
> the original bio.

My bad, this is not possible.

The original bio will have 1 as __bi_remaining as the initial value.

So even if the cloned one is finished first, the __bi_remaining will
only stay at 1, not reaching 0, so the original bio will not finish,
thus impossible to free the original bio.

I need to dig further deeper to find out why the NULL pointer
dereference happens.

Thanks,
Qu
Naohiro Aota Oct. 10, 2024, 5:51 a.m. UTC | #6
On Wed, Oct 09, 2024 at 06:59:55PM GMT, David Sterba wrote:
> On Thu, Oct 10, 2024 at 12:57:50AM +0900, Naohiro Aota wrote:
> > The purpose of btrfs_bbio_propagate_error() shall be propagating an error
> > of split bio to its original btrfs_bio, and tell the error to the upper
> > layer. However, it's not working well on some cases.
> > 
> > * Case 1. Immediate (or quick) end_bio with an error
> > 
> > When btrfs sends btrfs_bio to mirrored devices, btrfs calls
> > btrfs_bio_end_io() when all the mirroring bios are completed. If that
> > btrfs_bio was split, it is from btrfs_clone_bioset and its end_io function
> > is btrfs_orig_write_end_io. For this case, btrfs_bbio_propagate_error()
> > accesses the orig_bbio's bio context to increase the error count.
> > 
> > That works well in most cases. However, if the end_io is called enough
> > fast, orig_bbio's bio context may not be properly set at that time. Since
> > the bio context is set when the orig_bbio (the last btrfs_bio) is sent to
> > devices, that might be too late for earlier split btrfs_bio's completion.
> > That will result in NULL pointer dereference.
> > 
> > That bug is easily reproducible by running btrfs/146 on zoned devices and
> > it shows the following trace.
> > 
> >     [   20.923980][   T13] BUG: kernel NULL pointer dereference, address: 0000000000000020
> >     [   20.925234][   T13] #PF: supervisor read access in kernel mode
> >     [   20.926122][   T13] #PF: error_code(0x0000) - not-present page
> >     [   20.927118][   T13] PGD 0 P4D 0
> >     [   20.927607][   T13] Oops: Oops: 0000 [#1] PREEMPT SMP PTI
> >     [   20.928424][   T13] CPU: 1 UID: 0 PID: 13 Comm: kworker/u32:1 Not tainted 6.11.0-rc7-BTRFS-ZNS+ #474
> >     [   20.929740][   T13] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> >     [   20.930697][   T13] Workqueue: writeback wb_workfn (flush-btrfs-5)
> >     [   20.931643][   T13] RIP: 0010:btrfs_bio_end_io+0xae/0xc0 [btrfs]
> >     [   20.932573][ T1415] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0
> >     [   20.932871][   T13] Code: ba e1 48 8b 7b 10 e8 f1 f5 f6 ff eb da 48 81 bf 10 01 00 00 40 0c 33 a0 74 09 40 88 b5 f1 00 00 00 eb b8 48 8b 85 18 01 00 00 <48> 8b 40 20 0f b7 50 24 f0 01 50 20 eb a3 0f 1f 40 00 90 90 90 90
> >     [   20.936623][   T13] RSP: 0018:ffffc9000006f248 EFLAGS: 00010246
> >     [   20.937543][   T13] RAX: 0000000000000000 RBX: ffff888005a7f080 RCX: ffffc9000006f1dc
> >     [   20.938788][   T13] RDX: 0000000000000000 RSI: 000000000000000a RDI: ffff888005a7f080
> >     [   20.940016][   T13] RBP: ffff888011dfc540 R08: 0000000000000000 R09: 0000000000000001
> >     [   20.941227][   T13] R10: ffffffff82e508e0 R11: 0000000000000005 R12: ffff88800ddfbe58
> >     [   20.942375][   T13] R13: ffff888005a7f080 R14: ffff888005a7f158 R15: ffff888005a7f158
> >     [   20.943531][   T13] FS:  0000000000000000(0000) GS:ffff88803ea80000(0000) knlGS:0000000000000000
> >     [   20.944838][   T13] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >     [   20.945811][   T13] CR2: 0000000000000020 CR3: 0000000002e22006 CR4: 0000000000370ef0
> >     [   20.946984][   T13] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >     [   20.948150][   T13] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >     [   20.949327][   T13] Call Trace:
> >     [   20.949949][   T13]  <TASK>
> >     [   20.950374][   T13]  ? __die_body.cold+0x19/0x26
> >     [   20.951066][   T13]  ? page_fault_oops+0x13e/0x2b0
> >     [   20.951766][   T13]  ? _printk+0x58/0x73
> >     [   20.952358][   T13]  ? do_user_addr_fault+0x5f/0x750
> >     [   20.953120][   T13]  ? exc_page_fault+0x76/0x240
> >     [   20.953827][   T13]  ? asm_exc_page_fault+0x22/0x30
> >     [   20.954606][   T13]  ? btrfs_bio_end_io+0xae/0xc0 [btrfs]
> >     [   20.955616][   T13]  ? btrfs_log_dev_io_error+0x7f/0x90 [btrfs]
> >     [   20.956682][   T13]  btrfs_orig_write_end_io+0x51/0x90 [btrfs]
> >     [   20.957769][   T13]  dm_submit_bio+0x5c2/0xa50 [dm_mod]
> >     [   20.958623][   T13]  ? find_held_lock+0x2b/0x80
> >     [   20.959339][   T13]  ? blk_try_enter_queue+0x90/0x1e0
> >     [   20.960228][   T13]  __submit_bio+0xe0/0x130
> >     [   20.960879][   T13]  ? ktime_get+0x10a/0x160
> >     [   20.961546][   T13]  ? lockdep_hardirqs_on+0x74/0x100
> >     [   20.962310][   T13]  submit_bio_noacct_nocheck+0x199/0x410
> >     [   20.963140][   T13]  btrfs_submit_bio+0x7d/0x150 [btrfs]
> >     [   20.964089][   T13]  btrfs_submit_chunk+0x1a1/0x6d0 [btrfs]
> >     [   20.965066][   T13]  ? lockdep_hardirqs_on+0x74/0x100
> >     [   20.965824][   T13]  ? __folio_start_writeback+0x10/0x2c0
> >     [   20.966659][   T13]  btrfs_submit_bbio+0x1c/0x40 [btrfs]
> >     [   20.967617][   T13]  submit_one_bio+0x44/0x60 [btrfs]
> >     [   20.968536][   T13]  submit_extent_folio+0x13f/0x330 [btrfs]
> >     [   20.969552][   T13]  ? btrfs_set_range_writeback+0xa3/0xd0 [btrfs]
> >     [   20.970625][   T13]  extent_writepage_io+0x18b/0x360 [btrfs]
> >     [   20.971632][   T13]  extent_write_locked_range+0x17c/0x340 [btrfs]
> >     [   20.972702][   T13]  ? __pfx_end_bbio_data_write+0x10/0x10 [btrfs]
> >     [   20.973857][   T13]  run_delalloc_cow+0x71/0xd0 [btrfs]
> >     [   20.974841][   T13]  btrfs_run_delalloc_range+0x176/0x500 [btrfs]
> >     [   20.975870][   T13]  ? find_lock_delalloc_range+0x119/0x260 [btrfs]
> >     [   20.976911][   T13]  writepage_delalloc+0x2ab/0x480 [btrfs]
> >     [   20.977792][   T13]  extent_write_cache_pages+0x236/0x7d0 [btrfs]
> >     [   20.978728][   T13]  btrfs_writepages+0x72/0x130 [btrfs]
> >     [   20.979531][   T13]  do_writepages+0xd4/0x240
> >     [   20.980111][   T13]  ? find_held_lock+0x2b/0x80
> >     [   20.980695][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
> >     [   20.981461][   T13]  ? wbc_attach_and_unlock_inode+0x12c/0x290
> >     [   20.982213][   T13]  __writeback_single_inode+0x5c/0x4c0
> >     [   20.982859][   T13]  ? do_raw_spin_unlock+0x49/0xb0
> >     [   20.983439][   T13]  writeback_sb_inodes+0x22c/0x560
> >     [   20.984079][   T13]  __writeback_inodes_wb+0x4c/0xe0
> >     [   20.984886][   T13]  wb_writeback+0x1d6/0x3f0
> >     [   20.985536][   T13]  wb_workfn+0x334/0x520
> >     [   20.986044][   T13]  process_one_work+0x1ee/0x570
> >     [   20.986580][   T13]  ? lock_is_held_type+0xc6/0x130
> >     [   20.987142][   T13]  worker_thread+0x1d1/0x3b0
> >     [   20.987918][   T13]  ? __pfx_worker_thread+0x10/0x10
> >     [   20.988690][   T13]  kthread+0xee/0x120
> >     [   20.989180][   T13]  ? __pfx_kthread+0x10/0x10
> >     [   20.989915][   T13]  ret_from_fork+0x30/0x50
> >     [   20.990615][   T13]  ? __pfx_kthread+0x10/0x10
> >     [   20.991336][   T13]  ret_from_fork_asm+0x1a/0x30
> >     [   20.992106][   T13]  </TASK>
> >     [   20.992482][   T13] Modules linked in: dm_mod btrfs blake2b_generic xor raid6_pq rapl
> >     [   20.993406][   T13] CR2: 0000000000000020
> >     [   20.993884][   T13] ---[ end trace 0000000000000000 ]---
> >     [   20.993954][ T1415] BUG: kernel NULL pointer dereference, address: 0000000000000020
> > 
> > * Case 2. Earlier completion of orig_bbio for mirrored btrfs_bios
> > 
> > btrfs_bbio_propagate_error() assumes the end_io function for orig_bbio is
> > called last among split bios. In that case, btrfs_orig_write_end_io() sets
> > the bio->bi_status to BLK_STS_IOERR by seeing the bioc->error [1].
> > Otherwise, the increased orig_bio's bioc->error is not checked by anyone
> > and return BLK_STS_OK to the upper layer.
> > 
> > [1] Actually, this is not true. Because we only increases orig_bioc->errors
> > by max_errors, the condition "atomic_read(&bioc->error) > bioc->max_errors"
> > is still not met if only one split btrfs_bio fails.
> > 
> > * Case 3. Later completion of orig_bbio for un-mirrored btrfs_bios
> > 
> > In contrast to the above case, btrfs_bbio_propagate_error() is not working
> > well if un-mirrored orig_bbio is completed last. It sets
> > orig_bbio->bio.bi_status to the btrfs_bio's error. But, that is easily
> > over-written by orig_bbio's completion status. If the status is BLK_STS_OK,
> > the upper layer would not know the failure.
> > 
> > * Solution
> > 
> > Considering the above cases, we can only save the error status in the
> > orig_bbio itself as it is always available. Also, the saved error status
> > should be propagated when all the split btrfs_bios are finished (i.e,
> > bbio->pending_ios == 0).
> > 
> > This commit introduces "status" to btrfs_bbio and uses the last saved error
> > status for bbio->bio.bi_status.
> > 
> > With this commit, btrfs/146 on zoned devices does not hit the NULL pointer
> > dereference.
> > 
> > Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios")
> > CC: stable@vger.kernel.org # 6.6+
> > Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> > ---
> >  fs/btrfs/bio.c | 33 +++++++++------------------------
> >  fs/btrfs/bio.h |  3 +++
> >  2 files changed, 12 insertions(+), 24 deletions(-)
> > 
> > diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
> > index 056f8a171bba..a43d88bdcae7 100644
> > --- a/fs/btrfs/bio.c
> > +++ b/fs/btrfs/bio.c
> > @@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
> >  	bbio->end_io = end_io;
> >  	bbio->private = private;
> >  	atomic_set(&bbio->pending_ios, 1);
> > +	atomic_set(&bbio->status, BLK_STS_OK);
> >  }
> >  
> >  /*
> > @@ -120,41 +121,25 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
> >  	}
> >  }
> >  
> > -static void btrfs_orig_write_end_io(struct bio *bio);
> > -
> > -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
> > -				       struct btrfs_bio *orig_bbio)
> > -{
> > -	/*
> > -	 * For writes we tolerate nr_mirrors - 1 write failures, so we can't
> > -	 * just blindly propagate a write failure here.  Instead increment the
> > -	 * error count in the original I/O context so that it is guaranteed to
> > -	 * be larger than the error tolerance.
> > -	 */
> > -	if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
> > -		struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
> > -		struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
> > -
> > -		atomic_add(orig_bioc->max_errors, &orig_bioc->error);
> > -	} else {
> > -		orig_bbio->bio.bi_status = bbio->bio.bi_status;
> > -	}
> > -}
> > -
> >  void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
> >  {
> >  	bbio->bio.bi_status = status;
> >  	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
> >  		struct btrfs_bio *orig_bbio = bbio->private;
> >  
> > -		if (bbio->bio.bi_status)
> > -			btrfs_bbio_propagate_error(bbio, orig_bbio);
> > +		/* Save the last error. */
> > +		if (bbio->bio.bi_status != BLK_STS_OK)
> > +			atomic_set(&orig_bbio->status, bbio->bio.bi_status);
> >  		btrfs_cleanup_bio(bbio);
> >  		bbio = orig_bbio;
> >  	}
> >  
> > -	if (atomic_dec_and_test(&bbio->pending_ios))
> > +	if (atomic_dec_and_test(&bbio->pending_ios)) {
> > +		/* Load split bio's error which might be set above. */
> > +		if (status == BLK_STS_OK)
> > +			bbio->bio.bi_status = atomic_read(&bbio->status);
> >  		__btrfs_bio_end_io(bbio);
> > +	}
> >  }
> >  
> >  static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
> > diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
> > index e48612340745..b8f7f6071bc2 100644
> > --- a/fs/btrfs/bio.h
> > +++ b/fs/btrfs/bio.h
> > @@ -79,6 +79,9 @@ struct btrfs_bio {
> >  	/* File system that this I/O operates on. */
> >  	struct btrfs_fs_info *fs_info;
> >  
> > +	/* Set the error status of split bio. */
> > +	atomic_t status;
> 
> To repeat my comments from slack here. This should not be atomic when
> it's using only set and read, a plain int or blk_sts_t.

Yes, I'll change this to blk_status_t for better understanding.

> 
> The logic of storing the last error in btrfs_bio makes sense, I don't
> see other ways to do it. If there are multiple errors we can store the
> first one or the last one, we'd always lose some information. When it's
> the first one it could be set by cmpxchg.

Sure. I'll use cmpxchg to save the first one.
Christoph Hellwig Oct. 10, 2024, 7:06 a.m. UTC | #7
On Thu, Oct 10, 2024 at 09:41:04AM +1030, Qu Wenruo wrote:
> My bad, this is not possible.
>
> The original bio will have 1 as __bi_remaining as the initial value.
>
> So even if the cloned one is finished first, the __bi_remaining will
> only stay at 1, not reaching 0, so the original bio will not finish,
> thus impossible to free the original bio.
>
> I need to dig further deeper to find out why the NULL pointer
> dereference happens.

Yes, that was my assumption when writing this code.  So I'd love
to understand what is going on here, as it might indicate deeper
problems with the btrfs_bio lifetime.
Naohiro Aota Oct. 10, 2024, 8:53 a.m. UTC | #8
On Thu, Oct 10, 2024 at 09:06:20AM GMT, Christoph Hellwig wrote:
> On Thu, Oct 10, 2024 at 09:41:04AM +1030, Qu Wenruo wrote:
> > My bad, this is not possible.
> >
> > The original bio will have 1 as __bi_remaining as the initial value.
> >
> > So even if the cloned one is finished first, the __bi_remaining will
> > only stay at 1, not reaching 0, so the original bio will not finish,
> > thus impossible to free the original bio.
> >
> > I need to dig further deeper to find out why the NULL pointer
> > dereference happens.
> 
> Yes, that was my assumption when writing this code.  So I'd love
> to understand what is going on here, as it might indicate deeper
> problems with the btrfs_bio lifetime.
> 

First, there are two "clone" things regarding btrfs bio submission.

One is "clone"d in btrfs_split_bio(). It splits a btrfs_bio by allocating a
btrfs_bio from "btrfs_clone_bioset". It is logical space level split. Its
count is manged by original btrfs_bio's pennding_ios.

The other is "clone"d in btrfs_submit_mirrored_bio(). It clones the each
btrfs_bio's (after split) orig_bio (= btrfs_bio.bio) with
btrfs_alloc_clone(). It also sets the end_io to
"btrfs_clone_write_end_io". Its count is manged by BIO_CHAIN flag and
bio->__bi_remaining.

What my patch addresses is the first one, and the above comment from Qu is
about the second one. Let's say the first one as a "split clone" bio and
the second one as a "mirror clone" bio to distinguish them.

We also have two "original" bio. One is an originally large one coming from
the upper layer, and becomes the remaining part after split. The other is
original one*s* (because they are split, we have multiple original in this
regard) being mirrored in btrfs_submit_mirrored_bio(). Let's call them
"split original" and "mirroring original".


As Qu said, when a "mirror clone" bio completes, it calls bio_endio() for
"mirroring original" bio (after split). Since we have BIO_CHAIN flag and
__bi_remaining increased, until all the "mirror clone" bio + the "mirroring
original" bio completes, the end_io function (= btrfs_orig_write_end_io) is
never called. So, it is ensured that "mirroring original" bio's private
data is properly set at that time. So, it properly calls
btrfs_orig_write_end_io() and we can safely touch "mirroring original"
bio's data.

In btrfs_orig_write_end_io(), we call btrfs_bio_end_io(). In the function,
we call btrfs_bbio_propagate_error() to propagate the current "split clone"
btrfs_bio's error to "split original" btrfs_bio. To do so,
btrfs_bbio_propagate_error() accesses "split original" btrfs_bio
(non-mirrored case) and its private data (mirrored case).

Things can screw up here. If the btrfs_bio_end_io() for a "split clone"
btrfs_bio is called fast enough (or immediately in case of dm-error), the
"split original" btrfs_bio is not yet submitted. Thus, its private data,
which is setup in btrfs_submit_mirrored_bio(), is not yet have a proper
value. So, reading orig_bbio->bio.bi_private returns NULL, and accessing
its max_errors results in NULL pointer dereference.


Qu and I discussed about another solution: using bio_list to keep bios
around and send them at once. That ensures we have proper
"orig_bbio->bio.bi_private" ("split original" btrfs_bio's private
data). But, it also adds some latency which weaken RAID0. Also, the purpose
of btrfs_bbio_propagate_error() is propagating an error to "split original"
btrfs_bio. So, touching its private data is not necessary. We can just save
the failure status in btrfs_bio itself, which is always available until all
the bios complete.

Thanks,
Christoph Hellwig Oct. 10, 2024, 9:45 a.m. UTC | #9
On Thu, Oct 10, 2024 at 08:53:52AM +0000, Naohiro Aota wrote:
> Things can screw up here. If the btrfs_bio_end_io() for a "split clone"
> btrfs_bio is called fast enough (or immediately in case of dm-error), the
> "split original" btrfs_bio is not yet submitted. Thus, its private data,
> which is setup in btrfs_submit_mirrored_bio(), is not yet have a proper
> value. So, reading orig_bbio->bio.bi_private returns NULL, and accessing
> its max_errors results in NULL pointer dereference.

Ok, thanks.  So yes, the cmpxchg based version of this should be fine.
David Sterba Oct. 10, 2024, 11:02 a.m. UTC | #10
On Thu, Oct 10, 2024 at 05:51:50AM +0000, Naohiro Aota wrote:
> > > -		if (bbio->bio.bi_status)
> > > -			btrfs_bbio_propagate_error(bbio, orig_bbio);
> > > +		/* Save the last error. */
> > > +		if (bbio->bio.bi_status != BLK_STS_OK)
> > > +			atomic_set(&orig_bbio->status, bbio->bio.bi_status);
> > >  		btrfs_cleanup_bio(bbio);
> > >  		bbio = orig_bbio;
> > >  	}
> > >  
> > > -	if (atomic_dec_and_test(&bbio->pending_ios))
> > > +	if (atomic_dec_and_test(&bbio->pending_ios)) {
> > > +		/* Load split bio's error which might be set above. */
> > > +		if (status == BLK_STS_OK)
> > > +			bbio->bio.bi_status = atomic_read(&bbio->status);
> > >  		__btrfs_bio_end_io(bbio);
> > > +	}
> > >  }
> > >  
> > >  static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
> > > diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
> > > index e48612340745..b8f7f6071bc2 100644
> > > --- a/fs/btrfs/bio.h
> > > +++ b/fs/btrfs/bio.h
> > > @@ -79,6 +79,9 @@ struct btrfs_bio {
> > >  	/* File system that this I/O operates on. */
> > >  	struct btrfs_fs_info *fs_info;
> > >  
> > > +	/* Set the error status of split bio. */
> > > +	atomic_t status;
> > 
> > To repeat my comments from slack here. This should not be atomic when
> > it's using only set and read, a plain int or blk_sts_t.
> 
> Yes, I'll change this to blk_status_t for better understanding.
> 
> > 
> > The logic of storing the last error in btrfs_bio makes sense, I don't
> > see other ways to do it. If there are multiple errors we can store the
> > first one or the last one, we'd always lose some information. When it's
> > the first one it could be set by cmpxchg.
> 
> Sure. I'll use cmpxchg to save the first one.

The type blk_status_t is a u8, this works with cmpxchg, at least on
x86_64 and the common architectures.
kernel test robot Oct. 12, 2024, 6:12 a.m. UTC | #11
Hi Naohiro,

kernel test robot noticed the following build warnings:

[auto build test WARNING on kdave/for-next]
[also build test WARNING on linus/master v6.12-rc2 next-20241011]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Naohiro-Aota/btrfs-fix-error-propagation-of-split-bios/20241010-001915
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-next
patch link:    https://lore.kernel.org/r/db5c272fc27c59ddded5c691373c26458698cb1a.1728489285.git.naohiro.aota%40wdc.com
patch subject: [PATCH] btrfs: fix error propagation of split bios
config: i386-randconfig-061-20241012 (https://download.01.org/0day-ci/archive/20241012/202410121307.AKR3Gyyc-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241012/202410121307.AKR3Gyyc-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410121307.AKR3Gyyc-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> fs/btrfs/bio.c:125:65: sparse: sparse: incorrect type in argument 2 (different base types) @@     expected int i @@     got restricted blk_status_t [usertype] bi_status @@
   fs/btrfs/bio.c:125:65: sparse:     expected int i
   fs/btrfs/bio.c:125:65: sparse:     got restricted blk_status_t [usertype] bi_status
>> fs/btrfs/bio.c:133:45: sparse: sparse: incorrect type in assignment (different base types) @@     expected restricted blk_status_t [usertype] bi_status @@     got int @@
   fs/btrfs/bio.c:133:45: sparse:     expected restricted blk_status_t [usertype] bi_status
   fs/btrfs/bio.c:133:45: sparse:     got int

vim +125 fs/btrfs/bio.c

   116	
   117	void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
   118	{
   119		bbio->bio.bi_status = status;
   120		if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
   121			struct btrfs_bio *orig_bbio = bbio->private;
   122	
   123			/* Save the last error. */
   124			if (bbio->bio.bi_status != BLK_STS_OK)
 > 125				atomic_set(&orig_bbio->status, bbio->bio.bi_status);
   126			btrfs_cleanup_bio(bbio);
   127			bbio = orig_bbio;
   128		}
   129	
   130		if (atomic_dec_and_test(&bbio->pending_ios)) {
   131			/* Load split bio's error which might be set above. */
   132			if (status == BLK_STS_OK)
 > 133				bbio->bio.bi_status = atomic_read(&bbio->status);
   134			__btrfs_bio_end_io(bbio);
   135		}
   136	}
   137
diff mbox series

Patch

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 056f8a171bba..a43d88bdcae7 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -49,6 +49,7 @@  void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 	bbio->end_io = end_io;
 	bbio->private = private;
 	atomic_set(&bbio->pending_ios, 1);
+	atomic_set(&bbio->status, BLK_STS_OK);
 }
 
 /*
@@ -120,41 +121,25 @@  static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
 	}
 }
 
-static void btrfs_orig_write_end_io(struct bio *bio);
-
-static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
-				       struct btrfs_bio *orig_bbio)
-{
-	/*
-	 * For writes we tolerate nr_mirrors - 1 write failures, so we can't
-	 * just blindly propagate a write failure here.  Instead increment the
-	 * error count in the original I/O context so that it is guaranteed to
-	 * be larger than the error tolerance.
-	 */
-	if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
-		struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
-		struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
-
-		atomic_add(orig_bioc->max_errors, &orig_bioc->error);
-	} else {
-		orig_bbio->bio.bi_status = bbio->bio.bi_status;
-	}
-}
-
 void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 {
 	bbio->bio.bi_status = status;
 	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
 		struct btrfs_bio *orig_bbio = bbio->private;
 
-		if (bbio->bio.bi_status)
-			btrfs_bbio_propagate_error(bbio, orig_bbio);
+		/* Save the last error. */
+		if (bbio->bio.bi_status != BLK_STS_OK)
+			atomic_set(&orig_bbio->status, bbio->bio.bi_status);
 		btrfs_cleanup_bio(bbio);
 		bbio = orig_bbio;
 	}
 
-	if (atomic_dec_and_test(&bbio->pending_ios))
+	if (atomic_dec_and_test(&bbio->pending_ios)) {
+		/* Load split bio's error which might be set above. */
+		if (status == BLK_STS_OK)
+			bbio->bio.bi_status = atomic_read(&bbio->status);
 		__btrfs_bio_end_io(bbio);
+	}
 }
 
 static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index e48612340745..b8f7f6071bc2 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -79,6 +79,9 @@  struct btrfs_bio {
 	/* File system that this I/O operates on. */
 	struct btrfs_fs_info *fs_info;
 
+	/* Set the error status of split bio. */
+	atomic_t status;
+
 	/*
 	 * This member must come last, bio_alloc_bioset will allocate enough
 	 * bytes for entire btrfs_bio but relies on bio being last.