
[1/2] btrfs: fix use-after-free in btrfs_encoded_read_endio

Message ID 88ede763421d4f9847a057bdc944cb9c684e8cae.1731316882.git.jth@kernel.org (mailing list archive)
State New, archived
Series: btrfs: fix use-after-free in btrfs_encoded_read_endio

Commit Message

Johannes Thumshirn Nov. 11, 2024, 9:28 a.m. UTC
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>

Shinichiro reported the following use-after-free that sometimes happens
in our CI system when running fstests' btrfs/284 on a TCMU runner
device:

   ==================================================================
   BUG: KASAN: slab-use-after-free in lock_release+0x708/0x780
   Read of size 8 at addr ffff888106a83f18 by task kworker/u80:6/219

   CPU: 8 UID: 0 PID: 219 Comm: kworker/u80:6 Not tainted 6.12.0-rc6-kts+ #15
   Hardware name: Supermicro Super Server/X11SPi-TF, BIOS 3.3 02/21/2020
   Workqueue: btrfs-endio btrfs_end_bio_work [btrfs]
   Call Trace:
    <TASK>
    dump_stack_lvl+0x6e/0xa0
    ? lock_release+0x708/0x780
    print_report+0x174/0x505
    ? lock_release+0x708/0x780
    ? __virt_addr_valid+0x224/0x410
    ? lock_release+0x708/0x780
    kasan_report+0xda/0x1b0
    ? lock_release+0x708/0x780
    ? __wake_up+0x44/0x60
    lock_release+0x708/0x780
    ? __pfx_lock_release+0x10/0x10
    ? __pfx_do_raw_spin_lock+0x10/0x10
    ? lock_is_held_type+0x9a/0x110
    _raw_spin_unlock_irqrestore+0x1f/0x60
    __wake_up+0x44/0x60
    btrfs_encoded_read_endio+0x14b/0x190 [btrfs]
    btrfs_check_read_bio+0x8d9/0x1360 [btrfs]
    ? lock_release+0x1b0/0x780
    ? trace_lock_acquire+0x12f/0x1a0
    ? __pfx_btrfs_check_read_bio+0x10/0x10 [btrfs]
    ? process_one_work+0x7e3/0x1460
    ? lock_acquire+0x31/0xc0
    ? process_one_work+0x7e3/0x1460
    process_one_work+0x85c/0x1460
    ? __pfx_process_one_work+0x10/0x10
    ? assign_work+0x16c/0x240
    worker_thread+0x5e6/0xfc0
    ? __pfx_worker_thread+0x10/0x10
    kthread+0x2c3/0x3a0
    ? __pfx_kthread+0x10/0x10
    ret_from_fork+0x31/0x70
    ? __pfx_kthread+0x10/0x10
    ret_from_fork_asm+0x1a/0x30
    </TASK>

   Allocated by task 3661:
    kasan_save_stack+0x30/0x50
    kasan_save_track+0x14/0x30
    __kasan_kmalloc+0xaa/0xb0
    btrfs_encoded_read_regular_fill_pages+0x16c/0x6d0 [btrfs]
    send_extent_data+0xf0f/0x24a0 [btrfs]
    process_extent+0x48a/0x1830 [btrfs]
    changed_cb+0x178b/0x2ea0 [btrfs]
    btrfs_ioctl_send+0x3bf9/0x5c20 [btrfs]
    _btrfs_ioctl_send+0x117/0x330 [btrfs]
    btrfs_ioctl+0x184a/0x60a0 [btrfs]
    __x64_sys_ioctl+0x12e/0x1a0
    do_syscall_64+0x95/0x180
    entry_SYSCALL_64_after_hwframe+0x76/0x7e

   Freed by task 3661:
    kasan_save_stack+0x30/0x50
    kasan_save_track+0x14/0x30
    kasan_save_free_info+0x3b/0x70
    __kasan_slab_free+0x4f/0x70
    kfree+0x143/0x490
    btrfs_encoded_read_regular_fill_pages+0x531/0x6d0 [btrfs]
    send_extent_data+0xf0f/0x24a0 [btrfs]
    process_extent+0x48a/0x1830 [btrfs]
    changed_cb+0x178b/0x2ea0 [btrfs]
    btrfs_ioctl_send+0x3bf9/0x5c20 [btrfs]
    _btrfs_ioctl_send+0x117/0x330 [btrfs]
    btrfs_ioctl+0x184a/0x60a0 [btrfs]
    __x64_sys_ioctl+0x12e/0x1a0
    do_syscall_64+0x95/0x180
    entry_SYSCALL_64_after_hwframe+0x76/0x7e

   The buggy address belongs to the object at ffff888106a83f00
    which belongs to the cache kmalloc-rnd-07-96 of size 96
   The buggy address is located 24 bytes inside of
    freed 96-byte region [ffff888106a83f00, ffff888106a83f60)

   The buggy address belongs to the physical page:
   page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888106a83800 pfn:0x106a83
   flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff)
   page_type: f5(slab)
   raw: 0017ffffc0000000 ffff888100053680 ffffea0004917200 0000000000000004
   raw: ffff888106a83800 0000000080200019 00000001f5000000 0000000000000000
   page dumped because: kasan: bad access detected

   Memory state around the buggy address:
    ffff888106a83e00: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
    ffff888106a83e80: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
   >ffff888106a83f00: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
                               ^
    ffff888106a83f80: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
    ffff888106a84000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
   ==================================================================

Further analysis of the trace and the crash dump's vmcore file shows
that the use-after-free happens in the wake_up() call in
btrfs_encoded_read_endio(), which operates on the wait_queue embedded in
the private data passed to the end_io handler.

Commit 4ff47df40447 ("btrfs: move priv off stack in
btrfs_encoded_read_regular_fill_pages()") moved 'struct
btrfs_encoded_read_private' off the stack.
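
For reference, here is an abridged sketch of that private structure,
reconstructed from the fields visible in the dump below and in the diff
(field order and the exact type of the io_uring context are assumptions;
the full definition lives in fs/btrfs/inode.c):

	struct btrfs_encoded_read_private {
		wait_queue_head_t wait;	/* woken from the endio handler */
		void *uring_ctx;	/* only set on the io_uring path */
		atomic_t pending;	/* in-flight bios + issuer's own ref */
		blk_status_t status;	/* first error seen, if any */
	};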

Before that commit one could see the corruption of the then on-stack
private data when analyzing the vmcore after a crash. Note the bogus
spinlock magic and list pointers and the negative atomic counters:

*(struct btrfs_encoded_read_private *)0xffff88815626eec8 = {
	.wait = (wait_queue_head_t){
		.lock = (spinlock_t){
			.rlock = (struct raw_spinlock){
				.raw_lock = (arch_spinlock_t){
					.val = (atomic_t){
						.counter = (int)-2005885696,
					},
					.locked = (u8)0,
					.pending = (u8)157,
					.locked_pending = (u16)40192,
					.tail = (u16)34928,
				},
				.magic = (unsigned int)536325682,
				.owner_cpu = (unsigned int)29,
				.owner = (void *)__SCT__tp_func_btrfs_transaction_commit+0x0 = 0x0,
				.dep_map = (struct lockdep_map){
					.key = (struct lock_class_key *)0xffff8881575a3b6c,
					.class_cache = (struct lock_class *[2]){ 0xffff8882a71985c0, 0xffffea00066f5d40 },
					.name = (const char *)0xffff88815626f100 = "",
					.wait_type_outer = (u8)37,
					.wait_type_inner = (u8)178,
					.lock_type = (u8)154,
				},
			},
			.__padding = (u8 [24]){ 0, 157, 112, 136, 50, 174, 247, 31, 29 },
			.dep_map = (struct lockdep_map){
				.key = (struct lock_class_key *)0xffff8881575a3b6c,
				.class_cache = (struct lock_class *[2]){ 0xffff8882a71985c0, 0xffffea00066f5d40 },
				.name = (const char *)0xffff88815626f100 = "",
				.wait_type_outer = (u8)37,
				.wait_type_inner = (u8)178,
				.lock_type = (u8)154,
			},
		},
		.head = (struct list_head){
			.next = (struct list_head *)0x112cca,
			.prev = (struct list_head *)0x47,
		},
	},
	.pending = (atomic_t){
		.counter = (int)-1491499288,
	},
	.status = (blk_status_t)130,
}

Move the call to bio_put() before the atomic decrement of priv->pending
and also change atomic_dec_return() to atomic_dec_and_test() to fix the
corruption, as atomic_dec_return() is defined as two instructions on
x86_64, whereas atomic_dec_and_test() is defined as a single atomic
operation.
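
For context, the issuer side in btrfs_encoded_read_regular_fill_pages()
pairs with the endio handler roughly as follows (a condensed sketch; the
allocation and the final kfree() match the KASAN traces above, the rest
is abbreviated and error handling plus the io_uring path are omitted):

	priv = kmalloc(sizeof(*priv), GFP_NOFS);
	init_waitqueue_head(&priv->wait);
	atomic_set(&priv->pending, 1);	/* the issuer's own reference */

	/* submit the read bios, each one doing
	 * atomic_inc(&priv->pending) before it is handed off */

	if (atomic_dec_return(&priv->pending) != 0)	/* drop our ref */
		io_wait_event(priv->wait, !atomic_read(&priv->pending));
	/* past this point priv, and with it priv->wait, is freed, while
	 * the endio handler may not have called wake_up() yet */
	kfree(priv);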

Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Suggested-by: Damien Le Moal <Damien.LeMoal@wdc.com>
Fixes: 1881fba89bd5 ("btrfs: add BTRFS_IOC_ENCODED_READ ioctl")
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Comments

Filipe Manana Nov. 11, 2024, 1:27 p.m. UTC | #1
On Mon, Nov 11, 2024 at 9:28 AM Johannes Thumshirn <jth@kernel.org> wrote:
>
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>
> Shinichiro reported the following use-after-free that sometimes happens
> in our CI system when running fstests' btrfs/284 on a TCMU runner
> device:
>
>    [...]
>
> Further analysis of the trace and the crash dump's vmcore file shows
> that the use-after-free happens in the wake_up() call in
> btrfs_encoded_read_endio(), which operates on the wait_queue embedded in
> the private data passed to the end_io handler.
>
> Commit 4ff47df40447 ("btrfs: move priv off stack in
> btrfs_encoded_read_regular_fill_pages()") moved 'struct
> btrfs_encoded_read_private' off the stack.
>
> Before that commit one can see a corruption of the private data when
> analyzing the vmcore after a crash:

Can you highlight what's the corruption?
Just dumping a large structure isn't immediately obvious, I suppose
you mean it's related to the large negative values of the atomic
counters?

>
> [...]
>
> Move the call to bio_put() before the atomic decrement of priv->pending
> and also change atomic_dec_return() to atomic_dec_and_test() to fix the
> corruption, as atomic_dec_return() is defined as two instructions on
> x86_64, whereas atomic_dec_and_test() is defined as a single atomic
> operation.

And why does this fix the problem?

Can we also get a description here about why the corruption happens?
Having this may be enough to understand why these changes fix the bug.

Thanks.


> [...]

Damien Le Moal Nov. 11, 2024, 1:44 p.m. UTC | #2
On 11/11/24 22:27, Filipe Manana wrote:
> On Mon, Nov 11, 2024 at 9:28 AM Johannes Thumshirn <jth@kernel.org> wrote:
>>
>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>
>> Shinichiro reported the following use-after-free that sometimes happens
>> in our CI system when running fstests' btrfs/284 on a TCMU runner
>> device:
>>
>>    [...]
>>
>> Further analysis of the trace and the crash dump's vmcore file shows
>> that the use-after-free happens in the wake_up() call in
>> btrfs_encoded_read_endio(), which operates on the wait_queue embedded in
>> the private data passed to the end_io handler.
>>
>> Commit 4ff47df40447 ("btrfs: move priv off stack in
>> btrfs_encoded_read_regular_fill_pages()") moved 'struct
>> btrfs_encoded_read_private' off the stack.
>>
>> Before that commit one can see a corruption of the private data when
>> analyzing the vmcore after a crash:
> 
> Can you highlight what's the corruption?
> Just dumping a large structure isn't immediately obvious, I suppose
> you mean it's related to the large negative values of the atomic
> counters?
> 
>>
>> [...]
>>
>> Move the call to bio_put() before the atomic decrement of priv->pending
>> and also change atomic_dec_return() to atomic_dec_and_test() to fix the
>> corruption, as atomic_dec_return() is defined as two instructions on
>> x86_64, whereas atomic_dec_and_test() is defined as a single atomic
>> operation.
> 
> And why does this fix the problem?
> 
> Can we also get a description here about why the corruption happens?
> Having this may be enough to understand why these changes fix the bug.

Waking up the issuer before doing the bio put can result in the issuer function
returning before the bio is cleaned up. Which is weird... But thinking more
about this, it is probably not a bug in itself since the bbio is allocated and
not on the stack of the issuer.

What is definitely a bug is that there is nothing atomic with:

if (atomic_dec_return(&priv->pending) == 0)
	wake_up(&priv->wait);

on the completion side and

if (atomic_dec_return(&priv.pending))
	io_wait_event(priv.wait, !atomic_read(&priv.pending));

on the issuer side (btrfs_encoded_read_regular_fill_pages()).

The atomic_dec_return() on the completion side can cause io_wait_event() to
return and thus have the issuer return *BEFORE* the completion side has a chance
to call wake_up(&priv->wait), because the "if" is not fully processed yet as the
comparison with 0 is still needed. And given that priv is on the stack of the
issuer, the wake_up() call can end up referencing invalid memory.
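
Spelled out as a timeline (assuming two CPUs and, before the commit
mentioned above, priv on the issuer's stack):

	completion (endio)                 issuer (fill_pages)
	------------------                 -------------------
	atomic_dec_return(&priv->pending)
	  returns 0
	                                   io_wait_event() condition
	                                   !atomic_read(&priv.pending) is
	                                   already true, waiter returns
	                                   issuer frees / reuses priv
	wake_up(&priv->wait)
	  touches freed memory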

Johannes,

I think more and more that the proper fix is to squash your 2 patches together
to avoid all these dangerous games played with the pending atomic.
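
One canonical way to close that window, shown purely for illustration
(whether patch 2/2 does exactly this is not visible here), is to wait on
a struct completion instead of the open-coded waitqueue:
wait_for_completion() synchronizes with complete() on the completion's
internal lock, so the waiter cannot return, and thus free priv, before
complete() has finished touching it. A hedged sketch:

	/* hypothetical: replace priv->wait with a completion that is
	 * initialized via init_completion(&priv->done) at setup time */

	/* completion side (endio) */
	if (atomic_dec_and_test(&priv->pending))
		complete(&priv->done);

	/* issuer side */
	if (atomic_dec_return(&priv->pending) != 0)
		wait_for_completion_io(&priv->done);
	kfree(priv);	/* safe: complete() is done with priv by now */
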
Filipe Manana Nov. 11, 2024, 2:21 p.m. UTC | #3
On Mon, Nov 11, 2024 at 1:44 PM Damien Le Moal <dlemoal@kernel.org> wrote:
>
> On 11/11/24 22:27, Filipe Manana wrote:
> > On Mon, Nov 11, 2024 at 9:28 AM Johannes Thumshirn <jth@kernel.org> wrote:
> >>
> >> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> >>
> >> Shinichiro reported the following use-after-free that sometimes happens
> >> in our CI system when running fstests' btrfs/284 on a TCMU runner
> >> device:
> >>
> >>    [...]
> >>
> >> Further analysis of the trace and the crash dump's vmcore file shows
> >> that the use-after-free happens in the wake_up() call in
> >> btrfs_encoded_read_endio(), which operates on the wait_queue embedded in
> >> the private data passed to the end_io handler.
> >>
> >> Commit 4ff47df40447 ("btrfs: move priv off stack in
> >> btrfs_encoded_read_regular_fill_pages()") moved 'struct
> >> btrfs_encoded_read_private' off the stack.
> >>
> >> Before that commit one can see a corruption of the private data when
> >> analyzing the vmcore after a crash:
> >
> > Can you highlight what's the corruption?
> > Just dumping a large structure isn't immediately obvious, I suppose
> > you mean it's related to the large negative values of the atomic
> > counters?
> >
> >>
> >> [...]
> >>
> >> Move the call to bio_put() before the atomic decrement of priv->pending
> >> and also change atomic_dec_return() to atomic_dec_and_test() to fix the
> >> corruption, as atomic_dec_return() is defined as two instructions on
> >> x86_64, whereas atomic_dec_and_test() is defined as a single atomic
> >> operation.
> >
> > And why does this fix the problem?
> >
> > Can we also get a description here about why the corruption happens?
> > Having this may be enough to understand why these changes fix the bug.
>
> Waking up the issuer before doing the bio put can result in the issuer function
> returning before the bio is cleaned up. Which is weird... But thinking more
> about this, it is probably not a bug in itself since the bbio is allocated and
> not on the stack of the issuer.
>
> What is definitely a bug is that there is nothing atomic with:
>
> if (atomic_dec_return(&priv->pending) == 0)
>         wake_up(&priv->wait);
>
> on the completion side and
>
> if (atomic_dec_return(&priv.pending))
>         io_wait_event(priv.wait, !atomic_read(&priv.pending));
>
> on the issuer side (btrfs_encoded_read_regular_fill_pages()).
>
> The atomic_dec_return() on the completion side can cause io_wait_event() to
> return and thus have the issuer return *BEFORE* the completion side has a chance
> to call wake_up(&priv->wait), because the "if" is not fully processed yet as the
> comparison with 0 is still needed. And given that priv is on the stack of the
> issuer, the wake_up() call can end up referencing invalid memory.

Sure, I get that.

What I'm asking for is to have the change log explain how/why the bug
happens and why the proposed solution fixes the bug.
And not just dump a structure's content and say what the change does
without any explanation at all.

Every patch fixing a bug should have those details.


> [...]

Mark Harmstone Nov. 11, 2024, 6:17 p.m. UTC | #4
On 11/11/24 09:28, Johannes Thumshirn wrote:
>
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> 
> Shinichiro reported the following use-after-free that sometimes happens
> in our CI system when running fstests' btrfs/284 on a TCMU runner
> device:
> 
>     [...]
> 
> Further analysis of the trace and the crash dump's vmcore file shows
> that the use-after-free happens in the wake_up() call in
> btrfs_encoded_read_endio(), which operates on the wait_queue embedded in
> the private data passed to the end_io handler.
> 
> Commit 4ff47df40447 ("btrfs: move priv off stack in
> btrfs_encoded_read_regular_fill_pages()") moved 'struct
> btrfs_encoded_read_private' off the stack.
> 
> Before that commit one can see a corruption of the private data when
> analyzing the vmcore after a crash:
> 
> [...]
> 
> Move the call to bio_put() before the atomic decrement of priv->pending
> and also change atomic_dec_return() to atomic_dec_and_test() to fix the
> corruption, as atomic_dec_return() is defined as two instructions on
> x86_64, whereas atomic_dec_and_test() is defined as a single atomic
> operation.
> 
> Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Suggested-by: Damien Le Moal <Damien.LeMoal@wdc.com>
> Fixes: 1881fba89bd5 ("btrfs: add BTRFS_IOC_ENCODED_READ ioctl")
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/inode.c | 4 ++--
>   1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 22b8e2764619..cb8b23a3e56b 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9089,7 +9089,8 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
>   		 */
>   		WRITE_ONCE(priv->status, bbio->bio.bi_status);
>   	}
> -	if (atomic_dec_return(&priv->pending) == 0) {
> +	bio_put(&bbio->bio);
> +	if (atomic_dec_and_test(&priv->pending)) {
>   		int err = blk_status_to_errno(READ_ONCE(priv->status));
>   
>   		if (priv->uring_ctx) {
> @@ -9099,7 +9100,6 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
>   			wake_up(&priv->wait);
>   		}
>   	}
> -	bio_put(&bbio->bio);
>   }
>   
>   int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,

Thanks Johannes, this patch looks good to me. Thank you for fixing.

It looks like atomic_dec_return is a bit of a footgun... does this imply 
that all instances of atomic_dec_return(&foo) == 0 should actually be 
atomic_dec_and_test(&foo)?

Mark

Reviewed-by: Mark Harmstone <maharmstone@fb.com>
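
On the footgun question above: atomic_dec_return(&foo) == 0 and
atomic_dec_and_test(&foo) are semantically equivalent. Both perform a
single atomic read-modify-write, and in either form the comparison runs
on the value returned into a register, not on the shared variable, so
neither variant closes the window between the decrement and the
wake_up() call. A userspace C11 analog, purely illustrative:

	#include <stdatomic.h>
	#include <stdbool.h>

	static bool dec_and_test(atomic_int *v)
	{
		/* the fetch-sub is the single atomic operation; "== 1"
		 * compares the privately returned old value, just as
		 * atomic_dec_return(v) == 0 compares the new value */
		return atomic_fetch_sub(v, 1) == 1;
	}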

Patch

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 22b8e2764619..cb8b23a3e56b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9089,7 +9089,8 @@  static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
 		 */
 		WRITE_ONCE(priv->status, bbio->bio.bi_status);
 	}
-	if (atomic_dec_return(&priv->pending) == 0) {
+	bio_put(&bbio->bio);
+	if (atomic_dec_and_test(&priv->pending)) {
 		int err = blk_status_to_errno(READ_ONCE(priv->status));
 
 		if (priv->uring_ctx) {
@@ -9099,7 +9100,6 @@  static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
 			wake_up(&priv->wait);
 		}
 	}
-	bio_put(&bbio->bio);
 }
 
 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,