diff mbox series

[v4,4/4] blk-flush: reuse rq queuelist in flush state machine

Message ID 20230717040058.3993930-5-chengming.zhou@linux.dev (mailing list archive)
State New, archived
Headers show
Series blk-mq: optimize flush and request size | expand

Commit Message

Chengming Zhou July 17, 2023, 4 a.m. UTC
From: Chengming Zhou <zhouchengming@bytedance.com>

Since we don't need to maintain inflight flush_data requests list
anymore, we can reuse rq->queuelist for flush pending list.

Note in mq_flush_data_end_io(), we need to re-initialize rq->queuelist
before reusing it in the state machine when end, since the rq->rq_next
also reuse it, may have corrupted rq->queuelist by the driver.

This patch decrease the size of struct request by 16 bytes.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-flush.c      | 17 ++++++++++-------
 include/linux/blk-mq.h |  1 -
 2 files changed, 10 insertions(+), 8 deletions(-)

Comments

Friedrich Weber May 24, 2024, 4:07 p.m. UTC | #1
Hi,

some of our (Proxmox VE) users have been reporting crashes [1] with NULL
pointer dereferences in blk_flush_complete_seq. AFAICT, all systems
crashing in blk_flush_complete_seq used software RAID1, and ran our
downstream kernel based on (Ubuntu) kernel 6.8.

On mainline kernel 6.9, I can reproduce the crash [1] by:

* compiling the kernel with CONFIG_FAIL_MAKE_REQUEST enabled
* setting up root on software RAID1
* enabling the write-intent bitmap with `mdadm --grow --bitmap=internal
/dev/md0`. So far, I have not been able to reproduce the crash with the
write-intent bitmap disabled.
* injecting write faults via CONFIG_FAIL_MAKE_REQUEST on one of the
RAID1 disks
* generating IO in a KVM guest with fio
* waiting ~5 minutes

I've used this reproducer for a bisect, which produced

 81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine)

as the first commit with which I can reproduce the crashes. I'm not 100%
sure it is this one because the reproducer is a bit flaky. But it does
sound plausible, as the commit is included in our 6.8 kernel, and
touches `queuelist` which is AFAICT where blk_flush_complete_seq
dereferences the NULL pointer.

Does anyone have an idea what could be the cause for the crash, or how
to further debug this? Happy to provide more information if needed, as
well as the complete reproducer (I'd need to clean it up a little bit
first).

Do you think this could also affect setups without software RAID?

Best,

Friedrich

[1]

[  132.292488] BUG: kernel NULL pointer dereference, address:
0000000000000008
[  132.293594] #PF: supervisor write access in kernel mode
[  132.294408] #PF: error_code(0x0002) - not-present page
[  132.295187] PGD 0 P4D 0
[  132.295609] Oops: 0002 [#1] PREEMPT SMP NOPTI
[  132.296285] CPU: 0 PID: 776 Comm: kvm Tainted: G            E
6.5.0-rc2-bisect11+ #22
[  132.297556] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[  132.299292] RIP: 0010:blk_flush_complete_seq+0x296/0x2e0
[  132.300122] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2
04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73
48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b
[  132.302946] RSP: 0018:ffffa431407a39b8 EFLAGS: 00010046
[  132.303765] RAX: 0000000000000000 RBX: ffff92d00c2fd000 RCX:
ffff92d00c2fd048
[  132.304856] RDX: ffff92d0036dfaa0 RSI: ffff92d00c2fd048 RDI:
0000000000000000
[  132.305948] RBP: ffffa431407a39f8 R08: 0000000000000000 R09:
0000000000000000
[  132.307043] R10: 0000000000000000 R11: 0000000000000000 R12:
0000000000000801
[  132.308120] R13: ffff92d0036dfa80 R14: ffff92d0036dfa90 R15:
ffff92d00ba09e00
[  132.309216] FS:  0000727d191b86c0(0000) GS:ffff92d137c00000(0000)
knlGS:0000000000000000
[  132.310457] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  132.311355] CR2: 0000000000000008 CR3: 000000018f088006 CR4:
0000000000372ef0
[  132.312444] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[  132.313535] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
0000000000000400
[  132.314633] Call Trace:
[  132.315016]  <TASK>
[  132.315381]  ? show_regs+0x6d/0x80
[  132.315921]  ? __die+0x24/0x80
[  132.316421]  ? page_fault_oops+0x176/0x500
[  132.317049]  ? do_user_addr_fault+0x31d/0x6a0
[  132.317744]  ? exc_page_fault+0x83/0x1b0
[  132.318368]  ? asm_exc_page_fault+0x27/0x30
[  132.319008]  ? blk_flush_complete_seq+0x296/0x2e0
[  132.319759]  ? __blk_mq_alloc_requests+0x383/0x3a0
[  132.320505]  ? wbt_wait+0xb3/0x100
[  132.321041]  blk_insert_flush+0xd1/0x220
[  132.321664]  blk_mq_submit_bio+0x564/0x690
[  132.322294]  __submit_bio+0xb3/0x1c0
[  132.322859]  submit_bio_noacct_nocheck+0x2b7/0x390
[  132.323622]  submit_bio_noacct+0x18a/0x6b0
[  132.324241]  submit_bio+0x6c/0x80
[  132.324774]  md_super_write+0xd1/0x120
[  132.325376]  write_page+0x23b/0x3f0
[  132.325915]  ? md_bitmap_wait_writes+0xda/0xf0
[  132.326624]  md_bitmap_unplug+0x9f/0x140
[  132.327251]  flush_bio_list+0x102/0x110 [raid1]
[  132.327963]  raid1_unplug+0x3c/0xe0 [raid1]
[  132.328626]  __blk_flush_plug+0xc1/0x130
[  132.329238]  blk_finish_plug+0x31/0x50
[  132.329819]  io_submit_sqes+0x53e/0x680
[  132.330426]  __do_sys_io_uring_enter+0x59a/0xc10
[  132.331149]  ? vfs_read+0x20a/0x360
[  132.331699]  __x64_sys_io_uring_enter+0x22/0x40
[  132.332414]  do_syscall_64+0x58/0x90
[  132.332979]  ? ksys_read+0xe6/0x100
[  132.333530]  ? exit_to_user_mode_prepare+0x49/0x220
[  132.334292]  ? syscall_exit_to_user_mode+0x1b/0x50
[  132.335032]  ? do_syscall_64+0x67/0x90
[  132.335627]  ? syscall_exit_to_user_mode+0x1b/0x50
[  132.336381]  ? do_syscall_64+0x67/0x90
[  132.336958]  ? syscall_exit_to_user_mode+0x1b/0x50
[  132.337707]  ? do_syscall_64+0x67/0x90
[  132.338296]  ? syscall_exit_to_user_mode+0x1b/0x50
[  132.339045]  ? do_syscall_64+0x67/0x90
[  132.339646]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[  132.340426] RIP: 0033:0x727d26282b95
[  132.341011] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87
d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f
05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24
[  132.343837] RSP: 002b:0000727d191b2fd8 EFLAGS: 00000246 ORIG_RAX:
00000000000001aa
[  132.344990] RAX: ffffffffffffffda RBX: 0000727d0c0039d0 RCX:
0000727d26282b95
[  132.346079] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
0000000000000033
[  132.347176] RBP: 0000727d0c0039d8 R08: 0000000000000000 R09:
0000000000000008
[  132.348272] R10: 0000000000000000 R11: 0000000000000246 R12:
0000727d0c003ac0
[  132.349376] R13: 0000000000000000 R14: 00005a5a2313bc68 R15:
00005a5a25cf0450
[  132.350472]  </TASK>
[  132.350828] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E)
cifs_arc4(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E)
ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E)
ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E)
sunrpc(E) softdog(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E)
nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_pmc_core(E)
kvm_intel(E) kvm(E) irqbypass(E) crct10dif_pclmul(E) polyval_clmulni(E)
polyval_generic(E) ghash_clmulni_intel(E) aesni_intel(E) crypto_simd(E)
cryptd(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E)
mac_hid(E) serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E)
efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E)
autofs4(E) hid_generic(E) usbhid(E) hid(E) raid10(E) raid456(E)
async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E)
async_tx(E) xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E)
crc32_pclmul(E) bochs(E) drm_vram_helper(E) psmouse(E) drm_ttm_helper(E)
uhci_hcd(E) ttm(E)
[  132.350919]  ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E)
[  132.365204] CR2: 0000000000000008
[  132.365747] ---[ end trace 0000000000000000 ]---
[  132.366468] RIP: 0010:blk_flush_complete_seq+0x296/0x2e0
[  132.367290] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2
04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73
48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b
[  132.370119] RSP: 0018:ffffa431407a39b8 EFLAGS: 00010046
[  132.370935] RAX: 0000000000000000 RBX: ffff92d00c2fd000 RCX:
ffff92d00c2fd048
[  132.372015] RDX: ffff92d0036dfaa0 RSI: ffff92d00c2fd048 RDI:
0000000000000000
[  132.373115] RBP: ffffa431407a39f8 R08: 0000000000000000 R09:
0000000000000000
[  132.374193] R10: 0000000000000000 R11: 0000000000000000 R12:
0000000000000801
[  132.375286] R13: ffff92d0036dfa80 R14: ffff92d0036dfa90 R15:
ffff92d00ba09e00
[  132.376385] FS:  0000727d191b86c0(0000) GS:ffff92d137c00000(0000)
knlGS:0000000000000000
[  132.377600] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  132.378486] CR2: 0000000000000008 CR3: 000000018f088006 CR4:
0000000000372ef0
[  132.379564] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[  132.380638] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
0000000000000400
[  132.381738] note: kvm[776] exited with irqs disabled
[  132.382631] note: kvm[776] exited with preempt_count 1
[  132.383469] ------------[ cut here ]------------
[  132.384193] WARNING: CPU: 0 PID: 776 at kernel/exit.c:818
do_exit+0x8db/0xae0
[  132.385307] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E)
cifs_arc4(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E)
ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E)
ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E)
sunrpc(E) softdog(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E)
nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_pmc_core(E)
kvm_intel(E) kvm(E) irqbypass(E) crct10dif_pclmul(E) polyval_clmulni(E)
polyval_generic(E) ghash_clmulni_intel(E) aesni_intel(E) crypto_simd(E)
cryptd(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E)
mac_hid(E) serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E)
efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E)
autofs4(E) hid_generic(E) usbhid(E) hid(E) raid10(E) raid456(E)
async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E)
async_tx(E) xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E)
crc32_pclmul(E) bochs(E) drm_vram_helper(E) psmouse(E) drm_ttm_helper(E)
uhci_hcd(E) ttm(E)
[  132.385396]  ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E)
[  132.399854] CPU: 0 PID: 776 Comm: kvm Tainted: G      D     E
6.5.0-rc2-bisect11+ #22
[  132.401165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[  132.402987] RIP: 0010:do_exit+0x8db/0xae0
[  132.403661] Code: e9 44 f8 ff ff 48 8b bb d8 09 00 00 31 f6 e8 fc d7
ff ff e9 f0 fd ff ff 4c 89 ee bf 05 06 00 00 e8 4a 2e 01 00 e9 70 f8 ff
ff <0f> 0b e9 9e f7 ff ff 0f 0b e9 57 f7 ff ff 48 89 df e8 bf 7d 13 00
[  132.406622] RSP: 0018:ffffa431407a3ec8 EFLAGS: 00010282
[  132.407487] RAX: 0000000000000000 RBX: ffff92d08d53a940 RCX:
0000000000000000
[  132.408637] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
0000000000000000
[  132.409791] RBP: ffffa431407a3f20 R08: 0000000000000000 R09:
0000000000000000
[  132.410961] R10: 0000000000000000 R11: 0000000000000000 R12:
ffff92d0031cba80
[  132.412102] R13: 0000000000000009 R14: ffff92d090d53180 R15:
0000000000000000
[  132.413256] FS:  0000727d191b86c0(0000) GS:ffff92d137c00000(0000)
knlGS:0000000000000000
[  132.414582] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  132.415512] CR2: 0000000000000008 CR3: 000000018f088006 CR4:
0000000000372ef0
[  132.416670] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[  132.417842] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
0000000000000400
[  132.418998] Call Trace:
[  132.419430]  <TASK>
[  132.419788]  ? show_regs+0x6d/0x80
[  132.420352]  ? __warn+0x89/0x160
[  132.420891]  ? do_exit+0x8db/0xae0
[  132.421477]  ? report_bug+0x17e/0x1b0
[  132.422063]  ? handle_bug+0x46/0x90
[  132.422655]  ? exc_invalid_op+0x18/0x80
[  132.423299]  ? asm_exc_invalid_op+0x1b/0x20
[  132.423975]  ? do_exit+0x8db/0xae0
[  132.424565]  ? do_exit+0x72/0xae0
[  132.425131]  ? _printk+0x60/0x90
[  132.425666]  make_task_dead+0x86/0x180
[  132.426297]  rewind_stack_and_make_dead+0x17/0x20
[  132.427056] RIP: 0033:0x727d26282b95
[  132.427681] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87
d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f
05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24
[  132.430663] RSP: 002b:0000727d191b2fd8 EFLAGS: 00000246 ORIG_RAX:
00000000000001aa
[  132.431895] RAX: ffffffffffffffda RBX: 0000727d0c0039d0 RCX:
0000727d26282b95
[  132.433049] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
0000000000000033
[  132.434203] RBP: 0000727d0c0039d8 R08: 0000000000000000 R09:
0000000000000008
[  132.435372] R10: 0000000000000000 R11: 0000000000000246 R12:
0000727d0c003ac0
[  132.436529] R13: 0000000000000000 R14: 00005a5a2313bc68 R15:
00005a5a25cf0450
[  132.437685]  </TASK>
[  132.438056] ---[ end trace 0000000000000000 ]---
Chengming Zhou May 27, 2024, 5:09 a.m. UTC | #2
Hi,

On 2024/5/25 00:07, Friedrich Weber wrote:
> Hi,
> 
> some of our (Proxmox VE) users have been reporting crashes [1] with NULL
> pointer dereferences in blk_flush_complete_seq. AFAICT, all systems
> crashing in blk_flush_complete_seq used software RAID1, and ran our
> downstream kernel based on (Ubuntu) kernel 6.8.
> 
> On mainline kernel 6.9, I can reproduce the crash [1] by:
> 
> * compiling the kernel with CONFIG_FAIL_MAKE_REQUEST enabled
> * setting up root on software RAID1
> * enabling the write-intent bitmap with `mdadm --grow --bitmap=internal
> /dev/md0`. So far, I have not been able to reproduce the crash with the
> write-intent bitmap disabled.
> * injecting write faults via CONFIG_FAIL_MAKE_REQUEST on one of the
> RAID1 disks
> * generating IO in a KVM guest with fio
> * waiting ~5 minutes
> 
> I've used this reproducer for a bisect, which produced
> 
>  81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine)
> 
> as the first commit with which I can reproduce the crashes. I'm not 100%
> sure it is this one because the reproducer is a bit flaky. But it does
> sound plausible, as the commit is included in our 6.8 kernel, and
> touches `queuelist` which is AFAICT where blk_flush_complete_seq
> dereferences the NULL pointer.

Ok, it will be better that I can reproduce it locally, will try later.

> 
> Does anyone have an idea what could be the cause for the crash, or how
> to further debug this? Happy to provide more information if needed, as
> well as the complete reproducer (I'd need to clean it up a little bit
> first).

BUG shows it panic on 0000000000000008, not sure what it's accessing then,
does it means rq->queuelist.next == 0 or something? Could you use add2line
to show the exact source code line that panic? I use blk_flush_complete_seq+0x296/0x2e0
and get block/blk-flush.c:190, which is "fq->flush_data_in_flight++;",
obviously fq can't be NULL. (I'm using the v6.9 kernel)

Thanks.

> 
> Do you think this could also affect setups without software RAID?
> 
> Best,
> 
> Friedrich
> 
> [1]
> 
> [  132.292488] BUG: kernel NULL pointer dereference, address:
> 0000000000000008
> [  132.293594] #PF: supervisor write access in kernel mode
> [  132.294408] #PF: error_code(0x0002) - not-present page
> [  132.295187] PGD 0 P4D 0
> [  132.295609] Oops: 0002 [#1] PREEMPT SMP NOPTI
> [  132.296285] CPU: 0 PID: 776 Comm: kvm Tainted: G            E
> 6.5.0-rc2-bisect11+ #22
> [  132.297556] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> [  132.299292] RIP: 0010:blk_flush_complete_seq+0x296/0x2e0
> [  132.300122] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2
> 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73
> 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b
> [  132.302946] RSP: 0018:ffffa431407a39b8 EFLAGS: 00010046
> [  132.303765] RAX: 0000000000000000 RBX: ffff92d00c2fd000 RCX:
> ffff92d00c2fd048
> [  132.304856] RDX: ffff92d0036dfaa0 RSI: ffff92d00c2fd048 RDI:
> 0000000000000000
> [  132.305948] RBP: ffffa431407a39f8 R08: 0000000000000000 R09:
> 0000000000000000
> [  132.307043] R10: 0000000000000000 R11: 0000000000000000 R12:
> 0000000000000801
> [  132.308120] R13: ffff92d0036dfa80 R14: ffff92d0036dfa90 R15:
> ffff92d00ba09e00
> [  132.309216] FS:  0000727d191b86c0(0000) GS:ffff92d137c00000(0000)
> knlGS:0000000000000000
> [  132.310457] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  132.311355] CR2: 0000000000000008 CR3: 000000018f088006 CR4:
> 0000000000372ef0
> [  132.312444] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [  132.313535] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> 0000000000000400
> [  132.314633] Call Trace:
> [  132.315016]  <TASK>
> [  132.315381]  ? show_regs+0x6d/0x80
> [  132.315921]  ? __die+0x24/0x80
> [  132.316421]  ? page_fault_oops+0x176/0x500
> [  132.317049]  ? do_user_addr_fault+0x31d/0x6a0
> [  132.317744]  ? exc_page_fault+0x83/0x1b0
> [  132.318368]  ? asm_exc_page_fault+0x27/0x30
> [  132.319008]  ? blk_flush_complete_seq+0x296/0x2e0
> [  132.319759]  ? __blk_mq_alloc_requests+0x383/0x3a0
> [  132.320505]  ? wbt_wait+0xb3/0x100
> [  132.321041]  blk_insert_flush+0xd1/0x220
> [  132.321664]  blk_mq_submit_bio+0x564/0x690
> [  132.322294]  __submit_bio+0xb3/0x1c0
> [  132.322859]  submit_bio_noacct_nocheck+0x2b7/0x390
> [  132.323622]  submit_bio_noacct+0x18a/0x6b0
> [  132.324241]  submit_bio+0x6c/0x80
> [  132.324774]  md_super_write+0xd1/0x120
> [  132.325376]  write_page+0x23b/0x3f0
> [  132.325915]  ? md_bitmap_wait_writes+0xda/0xf0
> [  132.326624]  md_bitmap_unplug+0x9f/0x140
> [  132.327251]  flush_bio_list+0x102/0x110 [raid1]
> [  132.327963]  raid1_unplug+0x3c/0xe0 [raid1]
> [  132.328626]  __blk_flush_plug+0xc1/0x130
> [  132.329238]  blk_finish_plug+0x31/0x50
> [  132.329819]  io_submit_sqes+0x53e/0x680
> [  132.330426]  __do_sys_io_uring_enter+0x59a/0xc10
> [  132.331149]  ? vfs_read+0x20a/0x360
> [  132.331699]  __x64_sys_io_uring_enter+0x22/0x40
> [  132.332414]  do_syscall_64+0x58/0x90
> [  132.332979]  ? ksys_read+0xe6/0x100
> [  132.333530]  ? exit_to_user_mode_prepare+0x49/0x220
> [  132.334292]  ? syscall_exit_to_user_mode+0x1b/0x50
> [  132.335032]  ? do_syscall_64+0x67/0x90
> [  132.335627]  ? syscall_exit_to_user_mode+0x1b/0x50
> [  132.336381]  ? do_syscall_64+0x67/0x90
> [  132.336958]  ? syscall_exit_to_user_mode+0x1b/0x50
> [  132.337707]  ? do_syscall_64+0x67/0x90
> [  132.338296]  ? syscall_exit_to_user_mode+0x1b/0x50
> [  132.339045]  ? do_syscall_64+0x67/0x90
> [  132.339646]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
> [  132.340426] RIP: 0033:0x727d26282b95
> [  132.341011] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87
> d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f
> 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24
> [  132.343837] RSP: 002b:0000727d191b2fd8 EFLAGS: 00000246 ORIG_RAX:
> 00000000000001aa
> [  132.344990] RAX: ffffffffffffffda RBX: 0000727d0c0039d0 RCX:
> 0000727d26282b95
> [  132.346079] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
> 0000000000000033
> [  132.347176] RBP: 0000727d0c0039d8 R08: 0000000000000000 R09:
> 0000000000000008
> [  132.348272] R10: 0000000000000000 R11: 0000000000000246 R12:
> 0000727d0c003ac0
> [  132.349376] R13: 0000000000000000 R14: 00005a5a2313bc68 R15:
> 00005a5a25cf0450
> [  132.350472]  </TASK>
> [  132.350828] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E)
> cifs_arc4(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E)
> ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E)
> ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E)
> sunrpc(E) softdog(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E)
> nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_pmc_core(E)
> kvm_intel(E) kvm(E) irqbypass(E) crct10dif_pclmul(E) polyval_clmulni(E)
> polyval_generic(E) ghash_clmulni_intel(E) aesni_intel(E) crypto_simd(E)
> cryptd(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E)
> mac_hid(E) serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E)
> efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E)
> autofs4(E) hid_generic(E) usbhid(E) hid(E) raid10(E) raid456(E)
> async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E)
> async_tx(E) xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E)
> crc32_pclmul(E) bochs(E) drm_vram_helper(E) psmouse(E) drm_ttm_helper(E)
> uhci_hcd(E) ttm(E)
> [  132.350919]  ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E)
> [  132.365204] CR2: 0000000000000008
> [  132.365747] ---[ end trace 0000000000000000 ]---
> [  132.366468] RIP: 0010:blk_flush_complete_seq+0x296/0x2e0
> [  132.367290] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2
> 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73
> 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b
> [  132.370119] RSP: 0018:ffffa431407a39b8 EFLAGS: 00010046
> [  132.370935] RAX: 0000000000000000 RBX: ffff92d00c2fd000 RCX:
> ffff92d00c2fd048
> [  132.372015] RDX: ffff92d0036dfaa0 RSI: ffff92d00c2fd048 RDI:
> 0000000000000000
> [  132.373115] RBP: ffffa431407a39f8 R08: 0000000000000000 R09:
> 0000000000000000
> [  132.374193] R10: 0000000000000000 R11: 0000000000000000 R12:
> 0000000000000801
> [  132.375286] R13: ffff92d0036dfa80 R14: ffff92d0036dfa90 R15:
> ffff92d00ba09e00
> [  132.376385] FS:  0000727d191b86c0(0000) GS:ffff92d137c00000(0000)
> knlGS:0000000000000000
> [  132.377600] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  132.378486] CR2: 0000000000000008 CR3: 000000018f088006 CR4:
> 0000000000372ef0
> [  132.379564] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [  132.380638] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> 0000000000000400
> [  132.381738] note: kvm[776] exited with irqs disabled
> [  132.382631] note: kvm[776] exited with preempt_count 1
> [  132.383469] ------------[ cut here ]------------
> [  132.384193] WARNING: CPU: 0 PID: 776 at kernel/exit.c:818
> do_exit+0x8db/0xae0
> [  132.385307] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E)
> cifs_arc4(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E)
> ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E)
> ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E)
> sunrpc(E) softdog(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E)
> nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_pmc_core(E)
> kvm_intel(E) kvm(E) irqbypass(E) crct10dif_pclmul(E) polyval_clmulni(E)
> polyval_generic(E) ghash_clmulni_intel(E) aesni_intel(E) crypto_simd(E)
> cryptd(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E)
> mac_hid(E) serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E)
> efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E)
> autofs4(E) hid_generic(E) usbhid(E) hid(E) raid10(E) raid456(E)
> async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E)
> async_tx(E) xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E)
> crc32_pclmul(E) bochs(E) drm_vram_helper(E) psmouse(E) drm_ttm_helper(E)
> uhci_hcd(E) ttm(E)
> [  132.385396]  ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E)
> [  132.399854] CPU: 0 PID: 776 Comm: kvm Tainted: G      D     E
> 6.5.0-rc2-bisect11+ #22
> [  132.401165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> [  132.402987] RIP: 0010:do_exit+0x8db/0xae0
> [  132.403661] Code: e9 44 f8 ff ff 48 8b bb d8 09 00 00 31 f6 e8 fc d7
> ff ff e9 f0 fd ff ff 4c 89 ee bf 05 06 00 00 e8 4a 2e 01 00 e9 70 f8 ff
> ff <0f> 0b e9 9e f7 ff ff 0f 0b e9 57 f7 ff ff 48 89 df e8 bf 7d 13 00
> [  132.406622] RSP: 0018:ffffa431407a3ec8 EFLAGS: 00010282
> [  132.407487] RAX: 0000000000000000 RBX: ffff92d08d53a940 RCX:
> 0000000000000000
> [  132.408637] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
> 0000000000000000
> [  132.409791] RBP: ffffa431407a3f20 R08: 0000000000000000 R09:
> 0000000000000000
> [  132.410961] R10: 0000000000000000 R11: 0000000000000000 R12:
> ffff92d0031cba80
> [  132.412102] R13: 0000000000000009 R14: ffff92d090d53180 R15:
> 0000000000000000
> [  132.413256] FS:  0000727d191b86c0(0000) GS:ffff92d137c00000(0000)
> knlGS:0000000000000000
> [  132.414582] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  132.415512] CR2: 0000000000000008 CR3: 000000018f088006 CR4:
> 0000000000372ef0
> [  132.416670] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [  132.417842] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> 0000000000000400
> [  132.418998] Call Trace:
> [  132.419430]  <TASK>
> [  132.419788]  ? show_regs+0x6d/0x80
> [  132.420352]  ? __warn+0x89/0x160
> [  132.420891]  ? do_exit+0x8db/0xae0
> [  132.421477]  ? report_bug+0x17e/0x1b0
> [  132.422063]  ? handle_bug+0x46/0x90
> [  132.422655]  ? exc_invalid_op+0x18/0x80
> [  132.423299]  ? asm_exc_invalid_op+0x1b/0x20
> [  132.423975]  ? do_exit+0x8db/0xae0
> [  132.424565]  ? do_exit+0x72/0xae0
> [  132.425131]  ? _printk+0x60/0x90
> [  132.425666]  make_task_dead+0x86/0x180
> [  132.426297]  rewind_stack_and_make_dead+0x17/0x20
> [  132.427056] RIP: 0033:0x727d26282b95
> [  132.427681] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87
> d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f
> 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24
> [  132.430663] RSP: 002b:0000727d191b2fd8 EFLAGS: 00000246 ORIG_RAX:
> 00000000000001aa
> [  132.431895] RAX: ffffffffffffffda RBX: 0000727d0c0039d0 RCX:
> 0000727d26282b95
> [  132.433049] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
> 0000000000000033
> [  132.434203] RBP: 0000727d0c0039d8 R08: 0000000000000000 R09:
> 0000000000000008
> [  132.435372] R10: 0000000000000000 R11: 0000000000000246 R12:
> 0000727d0c003ac0
> [  132.436529] R13: 0000000000000000 R14: 00005a5a2313bc68 R15:
> 00005a5a25cf0450
> [  132.437685]  </TASK>
> [  132.438056] ---[ end trace 0000000000000000 ]---
>
Friedrich Weber May 27, 2024, 4:04 p.m. UTC | #3
Hi Chengming,

Thank you for taking a look at this!

On 27/05/2024 07:09, Chengming Zhou wrote:
>> I've used this reproducer for a bisect, which produced
>>
>>  81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine)
>>
>> as the first commit with which I can reproduce the crashes. I'm not 100%
>> sure it is this one because the reproducer is a bit flaky. But it does
>> sound plausible, as the commit is included in our 6.8 kernel, and
>> touches `queuelist` which is AFAICT where blk_flush_complete_seq
>> dereferences the NULL pointer.
> 
> Ok, it will be better that I can reproduce it locally, will try later.

Interestingly, so far I haven't been able to reproduce the crash when
generating IO on the host itself, I only got crashes when generating IO
in a QEMU VM.

The reproducer in more detail:

- Compile Linux 6.9 with CONFIG_FAULT_INJECTION,
CONFIG_FAULT_INJECTION_DEBUG_FS, CONFIG_FAIL_MAKE_REQUEST and boot it on
the host

- On the host, I have root on ext4 on top of a RAID1 software raid, see
[0] for mdadm output. I have to enable the write-intent bitmap to
reproduce the crash.

- Run QEMU to start a Linux VM whose disk is backed by a qcow2 file on
root, see [1] for QEMU 8.2.2 command line. The VM is running Proxmox VE
here, but I don't think that matters.

- Run fio inside the VM as follows:

fio --name foo --size 1M --filename foo.bin --rw=write --bs=4k --iodepth
4 --ioengine libaio --numjobs 4 --time_based --runtime 600

- Inject faults on sdb1 on the host:

echo 1 > /sys/block/sdb/sdb1/make-it-fail
echo 1000 > /sys/kernel/debug/fail_make_request/times
echo 50 > /sys/kernel/debug/fail_make_request/interval
echo 5 > /sys/kernel/debug/fail_make_request/probability
echo 1 > /sys/kernel/debug/fail_make_request/verbose

- And usually (in somewhat 80% of cases) I get a host crash within ~5
minutes. Sometimes the software RAID declares sdb1 to be faulty and
disables it, in this case I have to try again.

>> Does anyone have an idea what could be the cause for the crash, or how
>> to further debug this? Happy to provide more information if needed, as
>> well as the complete reproducer (I'd need to clean it up a little bit
>> first).
> 
> BUG shows it panic on 0000000000000008, not sure what it's accessing then,
> does it means rq->queuelist.next == 0 or something? Could you use add2line
> to show the exact source code line that panic? I use blk_flush_complete_seq+0x296/0x2e0
> and get block/blk-flush.c:190, which is "fq->flush_data_in_flight++;",
> obviously fq can't be NULL. (I'm using the v6.9 kernel)

Sorry for the confusion, the crash dump was from a kernel compiled at
81ada09cc25e -- with 6.9, the offset seems to be different. See [2] for
a kernel 6.9 crash dump.

I don't know too much about kernel debugging, but I tried to get
something useful out of addr2line:

# addr2line -f -e /usr/lib/debug/vmlinux-6.9.0-debug2
blk_flush_complete_seq+0x291/0x2d0
__list_del
/[...]./include/linux/list.h:195

I tried to find the relevant portions in `objdump -SD blk-flush.o`, see
[3]. If I'm not mistaken, blk_flush_complete_seq+0x291 should point to

351:   48 89 4f 08             mov    %rcx,0x8(%rdi)

To me this looks like part of

	list_move_tail(&rq->queuelist, pending);

What do you think?

FWIW, I've also compiled a 6.9 kernel with CONFIG_LIST_DEBUG, and when
running the reproducer it started reporting corruptions, see [4] for the
first one. It did not crash, though, maybe because CONFIG_LIST_DEBUG
prevents the NULL pointer dereference?

Hope this helps! If I can provide anything else, just let me know.

Best wishes,

Friedrich

[0]

# mdadm --detail /dev/md0
/dev/md0:
           Version : 1.2
     Creation Time : Wed May 22 10:14:40 2024
        Raid Level : raid1
        Array Size : 33534976 (31.98 GiB 34.34 GB)
     Used Dev Size : 33534976 (31.98 GiB 34.34 GB)
      Raid Devices : 2
     Total Devices : 2
       Persistence : Superblock is persistent

     Intent Bitmap : Internal

       Update Time : Mon May 27 17:31:02 2024
             State : active
    Active Devices : 2
   Working Devices : 2
    Failed Devices : 0
     Spare Devices : 0

Consistency Policy : bitmap

              Name : reproflushfull:0  (local to host reproflushfull)
              UUID : fda4a959:d2dd0bef:d7094fb1:c6e3174c
            Events : 749

    Number   Major   Minor   RaidDevice State
       0       8        1        0      active sync   /dev/sda1
       1       8       17        1      active sync   /dev/sdb1

[1]

./qemu-8.2.2/qemu-system-x86_64 \
  -accel kvm \
  -name 'pve,debug-threads=on' \
  -chardev
'socket,id=qmp,path=/var/run/qemu-server/100.qmp,server=on,wait=off' \
  -mon 'chardev=qmp,mode=control' \
  -chardev 'socket,id=qmp-event,path=/var/run/qmeventd.sock,reconnect=5' \
  -mon 'chardev=qmp-event,mode=control' \
  -pidfile /var/run/qemu-server/100.pid \
  -smbios 'type=1,uuid=25a3012b-e72a-4121-9e57-f6f5a40ebe62' \
  -smp '4,sockets=1,cores=4,maxcpus=4' \
  -nodefaults \
  -vnc 'unix:/var/run/qemu-server/100.vnc,password=on' \
  -cpu host,+kvm_pv_eoi,+kvm_pv_unhalt \
  -m 4096 \
  -object 'iothread,id=iothread-virtioscsi0' \
  -device 'pci-bridge,id=pci.1,chassis_nr=1,bus=pci.0,addr=0x1e' \
  -device 'pci-bridge,id=pci.2,chassis_nr=2,bus=pci.0,addr=0x1f' \
  -device 'pci-bridge,id=pci.3,chassis_nr=3,bus=pci.0,addr=0x5' \
  -device 'vmgenid,guid=ab6d50e3-d574-4ba8-bb0c-3c0bbcc9cd2e' \
  -device 'piix3-usb-uhci,id=uhci,bus=pci.0,addr=0x1.0x2' \
  -device 'VGA,id=vga,bus=pci.0,addr=0x2' \
  -device
'virtio-scsi-pci,id=virtioscsi0,bus=pci.3,addr=0x1,iothread=iothread-virtioscsi0'
\
  -drive
'file=/var/lib/vz/images/100/vm-100-disk-0.qcow2,if=none,id=drive-scsi0,format=qcow2,cache=none,aio=io_uring,detect-zeroes=on'
\
  -device
'scsi-hd,bus=virtioscsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0,id=scsi0,bootindex=100'
\
  -machine 'type=pc'

[2]

[   85.282301] BUG: kernel NULL pointer dereference, address:
0000000000000008
[   85.283454] #PF: supervisor write access in kernel mode
[   85.284289] #PF: error_code(0x0002) - not-present page
[   85.285087] PGD 0 P4D 0
[   85.285500] Oops: 0002 [#1] PREEMPT SMP NOPTI
[   85.286211] CPU: 0 PID: 794 Comm: kvm Tainted: G            E
6.9.0-debug2 #27
[   85.287420] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[   85.289177] RIP: 0010:blk_flush_complete_seq+0x291/0x2d0
[   85.289994] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2
04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73
48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b
[   85.292894] RSP: 0018:ffff9c08c07f3998 EFLAGS: 00010046
[   85.293714] RAX: 0000000000000000 RBX: ffff8b9c8abad000 RCX:
ffff8b9c8abad048
[   85.294809] RDX: ffff8b9c80a3d6a0 RSI: ffff8b9c8abad048 RDI:
0000000000000000
[   85.295921] RBP: ffff9c08c07f39d8 R08: 0000000000000000 R09:
0000000000000000
[   85.297111] R10: 0000000000000000 R11: 0000000000000000 R12:
0000000000009801
[   85.298211] R13: ffff8b9c80a3d680 R14: ffff8b9c80a3d690 R15:
ffff8b9c8341ddc0
[   85.299353] FS:  0000776683e006c0(0000) GS:ffff8b9db7a00000(0000)
knlGS:0000000000000000
[   85.300825] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   85.301712] CR2: 0000000000000008 CR3: 000000010e6e4001 CR4:
0000000000372ef0
[   85.302810] Call Trace:
[   85.303242]  <TASK>
[   85.303597]  ? show_regs+0x6c/0x80
[   85.304169]  ? __die+0x24/0x80
[   85.304671]  ? page_fault_oops+0x175/0x5b0
[   85.305326]  ? do_user_addr_fault+0x311/0x680
[   85.306012]  ? exc_page_fault+0x82/0x1b0
[   85.306657]  ? asm_exc_page_fault+0x27/0x30
[   85.307297]  ? blk_flush_complete_seq+0x291/0x2d0
[   85.308023]  ? __blk_mq_alloc_requests+0x251/0x540
[   85.308785]  ? wbt_wait+0x33/0x100
[   85.309620]  blk_insert_flush+0xce/0x220
[   85.310307]  blk_mq_submit_bio+0x5cd/0x730
[   85.310948]  __submit_bio+0xb3/0x1c0
[   85.311535]  submit_bio_noacct_nocheck+0x2fc/0x3d0
[   85.312296]  submit_bio_noacct+0x1ba/0x6b0
[   85.312934]  ? ext4_file_write_iter+0x3b1/0x7e0
[   85.313663]  submit_bio+0xb1/0x110
[   85.314220]  md_super_write+0xcf/0x110
[   85.314816]  write_sb_page+0x148/0x300
[   85.315422]  filemap_write_page+0x5b/0x70
[   85.316058]  md_bitmap_unplug+0x99/0x1f0
[   85.316676]  flush_bio_list+0x107/0x110 [raid1]
[   85.317399]  raid1_unplug+0x3c/0xf0 [raid1]
[   85.318056]  __blk_flush_plug+0xbe/0x140
[   85.318673]  blk_finish_plug+0x30/0x50
[   85.319268]  io_submit_sqes+0x4c4/0x6c0
[   85.319872]  __do_sys_io_uring_enter+0x2f2/0x640
[   85.320590]  ? blk_mq_complete_request+0x26/0x40
[   85.321326]  __x64_sys_io_uring_enter+0x22/0x40
[   85.322044]  x64_sys_call+0x20b9/0x24b0
[   85.322656]  do_syscall_64+0x80/0x170
[   85.323243]  ? vring_interrupt+0x94/0x110
[   85.323878]  ? __x64_sys_ppoll+0xf2/0x170
[   85.324501]  ? handle_irq_event+0x52/0x80
[   85.325103]  ? _raw_spin_unlock+0xe/0x40
[   85.325728]  ? handle_edge_irq+0xda/0x250
[   85.326347]  ? irqentry_exit_to_user_mode+0x76/0x270
[   85.327114]  ? irqentry_exit+0x43/0x50
[   85.327703]  ? clear_bhb_loop+0x15/0x70
[   85.328286]  ? clear_bhb_loop+0x15/0x70
[   85.328897]  ? clear_bhb_loop+0x15/0x70
[   85.329541]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   85.330326] RIP: 0033:0x776695660b95
[   85.330908] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87
d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f
05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24
[   85.333736] RSP: 002b:0000776683dfafd8 EFLAGS: 00000246 ORIG_RAX:
00000000000001aa
[   85.334911] RAX: ffffffffffffffda RBX: 000077667c0039f0 RCX:
0000776695660b95
[   85.335999] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
0000000000000033
[   85.337111] RBP: 000077667c0039f8 R08: 0000000000000000 R09:
0000000000000008
[   85.338210] R10: 0000000000000000 R11: 0000000000000246 R12:
000077667c003ae0
[   85.339304] R13: 0000000000000000 R14: 000064175d5afc68 R15:
000064178a3f9450
[   85.340408]  </TASK>
[   85.340772] Modules linked in: tcp_diag(E) inet_diag(E) veth(E)
cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E)
iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E)
ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E)
ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E)
binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E)
intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E)
intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E)
kvm_intel(E) kvm(E) hid_generic(E) crct10dif_pclmul(E)
polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E)
sha256_ssse3(E) sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E)
cryptd(E) hid(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E)
serio_raw(E) mac_hid(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E)
efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E)
autofs4(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E)
async_pq(E) async_xor(E) async_tx(E)
[   85.340859]  xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E)
crc32_pclmul(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) psmouse(E)
uhci_hcd(E) ehci_hcd(E) ttm(E) i2c_piix4(E) pata_acpi(E) floppy(E)
[   85.357137] CR2: 0000000000000008
[   85.358209] ---[ end trace 0000000000000000 ]---
[   85.358926] RIP: 0010:blk_flush_complete_seq+0x291/0x2d0
[   85.360004] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2
04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73
48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b
[   85.362891] RSP: 0018:ffff9c08c07f3998 EFLAGS: 00010046
[   85.363695] RAX: 0000000000000000 RBX: ffff8b9c8abad000 RCX:
ffff8b9c8abad048
[   85.364800] RDX: ffff8b9c80a3d6a0 RSI: ffff8b9c8abad048 RDI:
0000000000000000
[   85.365902] RBP: ffff9c08c07f39d8 R08: 0000000000000000 R09:
0000000000000000
[   85.366993] R10: 0000000000000000 R11: 0000000000000000 R12:
0000000000009801
[   85.368089] R13: ffff8b9c80a3d680 R14: ffff8b9c80a3d690 R15:
ffff8b9c8341ddc0
[   85.369190] FS:  0000776683e006c0(0000) GS:ffff8b9db7a00000(0000)
knlGS:0000000000000000
[   85.370440] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   85.371327] CR2: 0000000000000008 CR3: 000000010e6e4001 CR4:
0000000000372ef0
[   85.372440] note: kvm[794] exited with irqs disabled
[   85.373340] note: kvm[794] exited with preempt_count 1
[   85.374238] ------------[ cut here ]------------
[   85.374943] WARNING: CPU: 0 PID: 794 at kernel/exit.c:827
do_exit+0x8a2/0xab0
[   85.376047] Modules linked in: tcp_diag(E) inet_diag(E) veth(E)
cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E)
iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E)
ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E)
ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E)
binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E)
intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E)
intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E)
kvm_intel(E) kvm(E) hid_generic(E) crct10dif_pclmul(E)
polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E)
sha256_ssse3(E) sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E)
cryptd(E) hid(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E)
serio_raw(E) mac_hid(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E)
efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E)
autofs4(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E)
async_pq(E) async_xor(E) async_tx(E)
[   85.376128]  xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E)
crc32_pclmul(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) psmouse(E)
uhci_hcd(E) ehci_hcd(E) ttm(E) i2c_piix4(E) pata_acpi(E) floppy(E)
[   85.392395] CPU: 0 PID: 794 Comm: kvm Tainted: G      D     E
6.9.0-debug2 #27
[   85.393594] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[   85.395347] RIP: 0010:do_exit+0x8a2/0xab0
[   85.395975] Code: 38 0c 00 00 e9 2d fe ff ff 49 8d 7d 18 e8 b6 83 07
00 e9 5a f8 ff ff 48 8b bb f8 09 00 00 31 f6 e8 b3 e0 ff ff e9 c1 fd ff
ff <0f> 0b e9 d4 f7 ff ff 0f 0b e9 8f f7 ff ff 4c 89 e6 bf 05 06 00 00
[   85.398793] RSP: 0018:ffff9c08c07f3ec8 EFLAGS: 00010286
[   85.399625] RAX: 0000000000000000 RBX: ffff8b9c82e90000 RCX:
0000000000000000
[   85.400720] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
0000000000000000

[3]

00000000000000c0 <blk_flush_complete_seq>:
{
[...]
       struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 32a:   49 8d 56 01             lea    0x1(%r14),%rdx
 32e:   49 c1 e6 04             shl    $0x4,%r14
 332:   4d 01 ee                add    %r13,%r14
 335:   48 c1 e2 04             shl    $0x4,%rdx
        return READ_ONCE(head->next) == head;
 339:   49 8b 4e 10             mov    0x10(%r14),%rcx
 33d:   4c 01 ea                add    %r13,%rdx
                if (list_empty(pending))
 340:   48 39 ca                cmp    %rcx,%rdx
 343:   74 2b                   je     370 <blk_flush_complete_seq+0x2b0>
        __list_del(entry->prev, entry->next);
 345:   48 8b 4b 50             mov    0x50(%rbx),%rcx
 349:   48 8b 7b 48             mov    0x48(%rbx),%rdi
                list_move_tail(&rq->queuelist, pending);
 34d:   48 8d 73 48             lea    0x48(%rbx),%rsi
        next->prev = prev;
 351:   48 89 4f 08             mov    %rcx,0x8(%rdi)
        WRITE_ONCE(prev->next, next);
 355:   48 89 39                mov    %rdi,(%rcx)
        __list_add(new, head->prev, head);
 358:   49 8b 4e 18             mov    0x18(%r14),%rcx
        next->prev = new;
 35c:   49 89 76 18             mov    %rsi,0x18(%r14)
        new->next = next;
 360:   48 89 53 48             mov    %rdx,0x48(%rbx)
        new->prev = prev;
 364:   48 89 4b 50             mov    %rcx,0x50(%rbx)
        WRITE_ONCE(prev->next, new);
 368:   48 89 31                mov    %rsi,(%rcx)
}
 36b:   e9 30 fe ff ff          jmp    1a0 <blk_flush_complete_seq+0xe0>
                        fq->flush_pending_since = jiffies;
 370:   48 8b 0d 00 00 00 00    mov    0x0(%rip),%rcx        # 377
<blk_flush_complete_seq+0x2b7>
 377:   49 89 4d 08             mov    %rcx,0x8(%r13)
 37b:   eb c8                   jmp    345 <blk_flush_complete_seq+0x285>

[4] May 27 11:12:53 reproflushfull kernel: list_del corruption.
prev->next should be ffff8ad44ab63788, but was ffff8ad44ab63400.
(prev=ffff8ad44ab63788)
May 27 11:12:53 reproflushfull kernel: WARNING: CPU: 0 PID: 797 at
lib/list_debug.c:62 __list_del_entry_valid_or_report+0xd4/0x100
May 27 11:12:53 reproflushfull kernel: Modules linked in: veth(E)
cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E)
iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E)
ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E)
ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E)
binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E)
intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E)
intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E)
kvm_intel(E) kvm(E) crct10dif_pclmul(E) polyval_clmulni(E)
polyval_generic(E) hid_generic(E) ghash_clmulni_intel(E) sha256_ssse3(E)
sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E) cryptd(E) hid(E)
rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E)
serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E)
dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E)
raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E)
async_xor(E) async_tx(E) xor(E) raid6_pq(E)
May 27 11:12:53 reproflushfull kernel:  libcrc32c(E) raid0(E) raid1(E)
crc32_pclmul(E) bochs(E) psmouse(E) drm_vram_helper(E) drm_ttm_helper(E)
uhci_hcd(E) ttm(E) i2c_piix4(E) ehci_hcd(E) pata_acpi(E) floppy(E)
May 27 11:12:53 reproflushfull kernel: CPU: 0 PID: 797 Comm: kvm
Tainted: G            E      6.9.0-debug #25
May 27 11:12:53 reproflushfull kernel: Hardware name: QEMU Standard PC
(i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org
04/01/2014
May 27 11:12:53 reproflushfull kernel: RIP:
0010:__list_del_entry_valid_or_report+0xd4/0x100
May 27 11:12:53 reproflushfull kernel: Code: 48 89 fe 48 89 ca 48 c7 c7
78 77 9f a1 e8 84 3a 8e ff 0f 0b 31 c0 eb b0 48 89 fe 48 89 c2 48 c7 c7
b0 77 9f a1 e8 6c 3a 8e ff <0f> 0b 31 c0 eb 98 48 89 d1 48 c7 c7 f8 77
9f a1 48 89 f2 48 89 c6
May 27 11:12:53 reproflushfull kernel: RSP: 0018:ffffb1dd80673740
EFLAGS: 00010046
May 27 11:12:53 reproflushfull kernel: RAX: 0000000000000000 RBX:
ffff8ad440be12c0 RCX: 0000000000000000
May 27 11:12:53 reproflushfull kernel: RDX: 0000000000000000 RSI:
0000000000000000 RDI: 0000000000000000
May 27 11:12:53 reproflushfull kernel: RBP: ffffb1dd80673740 R08:
0000000000000000 R09: 0000000000000000
May 27 11:12:53 reproflushfull kernel: R10: 0000000000000000 R11:
0000000000000000 R12: ffff8ad44a8c1e00
May 27 11:12:53 reproflushfull kernel: R13: ffff8ad440be12c0 R14:
ffff8ad44ab63788 R15: ffff8ad44ab63740
May 27 11:12:53 reproflushfull kernel: FS:  00007e8a67c006c0(0000)
GS:ffff8ad577a00000(0000) knlGS:0000000000000000
May 27 11:12:53 reproflushfull kernel: CS:  0010 DS: 0000 ES: 0000 CR0:
0000000080050033
May 27 11:12:53 reproflushfull kernel: CR2: 00007e8a5801a008 CR3:
000000010e022001 CR4: 0000000000372ef0
May 27 11:12:53 reproflushfull kernel: Call Trace:
May 27 11:12:53 reproflushfull kernel:  <TASK>
May 27 11:12:53 reproflushfull kernel:  ? show_regs+0x6c/0x80
May 27 11:12:53 reproflushfull kernel:  ? __warn+0x88/0x140
May 27 11:12:53 reproflushfull kernel:  ?
__list_del_entry_valid_or_report+0xd4/0x100
May 27 11:12:53 reproflushfull kernel:  ? report_bug+0x182/0x1b0
May 27 11:12:53 reproflushfull kernel:  ? handle_bug+0x46/0x90
May 27 11:12:53 reproflushfull kernel:  ? exc_invalid_op+0x18/0x80
May 27 11:12:53 reproflushfull kernel:  ? asm_exc_invalid_op+0x1b/0x20
May 27 11:12:53 reproflushfull kernel:  ?
__list_del_entry_valid_or_report+0xd4/0x100
May 27 11:12:53 reproflushfull kernel:  blk_flush_complete_seq+0x2f3/0x360
May 27 11:12:53 reproflushfull kernel:  ? wbt_wait+0x33/0x100
May 27 11:12:53 reproflushfull kernel:  blk_insert_flush+0xce/0x220
May 27 11:12:53 reproflushfull kernel:  blk_mq_submit_bio+0x5cd/0x730
May 27 11:12:53 reproflushfull kernel:  __submit_bio+0xb3/0x1c0
May 27 11:12:53 reproflushfull kernel:
submit_bio_noacct_nocheck+0x2fc/0x3d0
May 27 11:12:53 reproflushfull kernel:  submit_bio_noacct+0x1ba/0x6b0
May 27 11:12:53 reproflushfull kernel:  submit_bio+0xb1/0x110
May 27 11:12:53 reproflushfull kernel:  md_super_write+0xcf/0x110
May 27 11:12:53 reproflushfull kernel:  write_sb_page+0x148/0x300
May 27 11:12:53 reproflushfull kernel:  filemap_write_page+0x5b/0x70
May 27 11:12:53 reproflushfull kernel:  md_bitmap_unplug+0x99/0x1f0
May 27 11:12:53 reproflushfull kernel:  flush_bio_list+0x107/0x110 [raid1]
May 27 11:12:53 reproflushfull kernel:  raid1_unplug+0x3c/0xf0 [raid1]
May 27 11:12:53 reproflushfull kernel:  __blk_flush_plug+0xd9/0x170
May 27 11:12:53 reproflushfull kernel:  blk_finish_plug+0x30/0x50
May 27 11:12:53 reproflushfull kernel:  io_submit_sqes+0x4c4/0x6c0
May 27 11:12:53 reproflushfull kernel:  __do_sys_io_uring_enter+0x2f2/0x640
May 27 11:12:53 reproflushfull kernel:  __x64_sys_io_uring_enter+0x22/0x40
May 27 11:12:53 reproflushfull kernel:  x64_sys_call+0x20b9/0x24b0
May 27 11:12:53 reproflushfull kernel:  do_syscall_64+0x80/0x170
May 27 11:12:53 reproflushfull kernel:  ? do_mprotect_pkey+0x198/0x620
May 27 11:12:53 reproflushfull kernel:  ?
__memcg_slab_post_alloc_hook+0x18e/0x230
May 27 11:12:53 reproflushfull kernel:  ? policy_nodemask+0x145/0x180
May 27 11:12:53 reproflushfull kernel:  ?
__mod_memcg_lruvec_state+0x87/0x120
May 27 11:12:53 reproflushfull kernel:  ? __mod_lruvec_state+0x36/0x50
May 27 11:12:53 reproflushfull kernel:  ? __lruvec_stat_mod_folio+0x70/0xc0
May 27 11:12:53 reproflushfull kernel:  ? set_ptes.constprop.0+0x2b/0xb0
May 27 11:12:53 reproflushfull kernel:  ? _raw_spin_unlock+0xe/0x40
May 27 11:12:53 reproflushfull kernel:  ? do_anonymous_page+0x23d/0x790
May 27 11:12:53 reproflushfull kernel:  ? __pte_offset_map+0x1c/0x1b0
May 27 11:12:53 reproflushfull kernel:  ? __handle_mm_fault+0xc1a/0xe90
May 27 11:12:53 reproflushfull kernel:  ? do_syscall_64+0x8c/0x170
May 27 11:12:53 reproflushfull kernel:  ? __count_memcg_events+0x6f/0xe0
May 27 11:12:53 reproflushfull kernel:  ?
count_memcg_events.constprop.0+0x2a/0x50
May 27 11:12:53 reproflushfull kernel:  ? handle_mm_fault+0xaf/0x340
May 27 11:12:53 reproflushfull kernel:  ? do_user_addr_fault+0x365/0x680
May 27 11:12:53 reproflushfull kernel:  ?
irqentry_exit_to_user_mode+0x76/0x270
May 27 11:12:53 reproflushfull kernel:  ? irqentry_exit+0x43/0x50
May 27 11:12:53 reproflushfull kernel:  ? clear_bhb_loop+0x15/0x70
May 27 11:12:53 reproflushfull kernel:  ? clear_bhb_loop+0x15/0x70
May 27 11:12:53 reproflushfull kernel:  ? clear_bhb_loop+0x15/0x70
May 27 11:12:53 reproflushfull kernel:
entry_SYSCALL_64_after_hwframe+0x76/0x7e
May 27 11:12:53 reproflushfull kernel: RIP: 0033:0x7e8a75069b95
May 27 11:12:53 reproflushfull kernel: Code: 00 00 00 44 89 d0 41 b9 08
00 00 00 83 c8 10 f6 87 d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45
31 c0 b8 aa 01 00 00 0f 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2
02 74 c2 f0 48 83 0c 24
May 27 11:12:53 reproflushfull kernel: RSP: 002b:00007e8a67bfafd8
EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
May 27 11:12:53 reproflushfull kernel: RAX: ffffffffffffffda RBX:
00007e8a580039f0 RCX: 00007e8a75069b95
May 27 11:12:53 reproflushfull kernel: RDX: 0000000000000000 RSI:
0000000000000020 RDI: 0000000000000033
May 27 11:12:53 reproflushfull kernel: RBP: 00007e8a580039f8 R08:
0000000000000000 R09: 0000000000000008
May 27 11:12:53 reproflushfull kernel: R10: 0000000000000000 R11:
0000000000000246 R12: 00007e8a58003ae0
May 27 11:12:53 reproflushfull kernel: R13: 0000000000000000 R14:
000064a145ca1c68 R15: 000064a183935450
May 27 11:12:53 reproflushfull kernel:  </TASK>
May 27 11:12:53 reproflushfull kernel: ---[ end trace 0000000000000000 ]---
Chengming Zhou May 27, 2024, 11:34 p.m. UTC | #4
On 2024/5/28 00:04, Friedrich Weber wrote:
> Hi Chengming,
> 
> Thank you for taking a look at this!
> 
> On 27/05/2024 07:09, Chengming Zhou wrote:
>>> I've used this reproducer for a bisect, which produced
>>>
>>>  81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine)
>>>
>>> as the first commit with which I can reproduce the crashes. I'm not 100%
>>> sure it is this one because the reproducer is a bit flaky. But it does
>>> sound plausible, as the commit is included in our 6.8 kernel, and
>>> touches `queuelist` which is AFAICT where blk_flush_complete_seq
>>> dereferences the NULL pointer.
>>
>> Ok, it will be better that I can reproduce it locally, will try later.
> 
> Interestingly, so far I haven't been able to reproduce the crash when
> generating IO on the host itself, I only got crashes when generating IO
> in a QEMU VM.
> 
> The reproducer in more detail:

Thanks for these details, I will try to setup and reproduce when I back to work.

> 
> - Compile Linux 6.9 with CONFIG_FAULT_INJECTION,
[...]
>>
>> BUG shows it panic on 0000000000000008, not sure what it's accessing then,
>> does it means rq->queuelist.next == 0 or something? Could you use add2line
>> to show the exact source code line that panic? I use blk_flush_complete_seq+0x296/0x2e0
>> and get block/blk-flush.c:190, which is "fq->flush_data_in_flight++;",
>> obviously fq can't be NULL. (I'm using the v6.9 kernel)
> 
> Sorry for the confusion, the crash dump was from a kernel compiled at
> 81ada09cc25e -- with 6.9, the offset seems to be different. See [2] for
> a kernel 6.9 crash dump.
> 
> I don't know too much about kernel debugging, but I tried to get
> something useful out of addr2line:
> 
> # addr2line -f -e /usr/lib/debug/vmlinux-6.9.0-debug2
> blk_flush_complete_seq+0x291/0x2d0
> __list_del
> /[...]./include/linux/list.h:195
> 
> I tried to find the relevant portions in `objdump -SD blk-flush.o`, see
> [3]. If I'm not mistaken, blk_flush_complete_seq+0x291 should point to
> 
> 351:   48 89 4f 08             mov    %rcx,0x8(%rdi)
> 
> To me this looks like part of
> 
> 	list_move_tail(&rq->queuelist, pending);
> 
> What do you think?

Yeah, it seems correct, so the rq->queuelist.next == NULL. It can't be NULL
if went through REQ_FSEQ_POSTFLUSH, so it must be REQ_FSEQ_PREFLUSH. It means
we allocated a request but its queuelist is not initialized or corrupted?

Anyway, I will use below changes for debugging when reproduce, and you could
also try this to see if we could get something useful. :)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3b4df8e5ac9e..6e3a6cd7739d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2989,6 +2989,8 @@ void blk_mq_submit_bio(struct bio *bio)
                blk_mq_use_cached_rq(rq, plug, bio);
        }

+       BUG_ON(rq->queuelist.next == NULL);
+
        trace_block_getrq(bio);

        rq_qos_track(q, rq, bio);
@@ -3006,6 +3008,8 @@ void blk_mq_submit_bio(struct bio *bio)
        if (bio_zone_write_plugging(bio))
                blk_zone_write_plug_init_request(rq);

+       BUG_ON(rq->queuelist.next == NULL);
+
        if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
                return;

Thanks!

> 
> FWIW, I've also compiled a 6.9 kernel with CONFIG_LIST_DEBUG, and when
> running the reproducer it started reporting corruptions, see [4] for the
> first one. It did not crash, though, maybe because CONFIG_LIST_DEBUG
> prevents the NULL pointer dereference?
> 
> Hope this helps! If I can provide anything else, just let me know.
> 
> Best wishes,
> 
> Friedrich
> 
> [0]
> 
> # mdadm --detail /dev/md0
> /dev/md0:
>            Version : 1.2
>      Creation Time : Wed May 22 10:14:40 2024
>         Raid Level : raid1
>         Array Size : 33534976 (31.98 GiB 34.34 GB)
>      Used Dev Size : 33534976 (31.98 GiB 34.34 GB)
>       Raid Devices : 2
>      Total Devices : 2
>        Persistence : Superblock is persistent
> 
>      Intent Bitmap : Internal
> 
>        Update Time : Mon May 27 17:31:02 2024
>              State : active
>     Active Devices : 2
>    Working Devices : 2
>     Failed Devices : 0
>      Spare Devices : 0
> 
> Consistency Policy : bitmap
> 
>               Name : reproflushfull:0  (local to host reproflushfull)
>               UUID : fda4a959:d2dd0bef:d7094fb1:c6e3174c
>             Events : 749
> 
>     Number   Major   Minor   RaidDevice State
>        0       8        1        0      active sync   /dev/sda1
>        1       8       17        1      active sync   /dev/sdb1
> 
> [1]
> 
> ./qemu-8.2.2/qemu-system-x86_64 \
>   -accel kvm \
>   -name 'pve,debug-threads=on' \
>   -chardev
> 'socket,id=qmp,path=/var/run/qemu-server/100.qmp,server=on,wait=off' \
>   -mon 'chardev=qmp,mode=control' \
>   -chardev 'socket,id=qmp-event,path=/var/run/qmeventd.sock,reconnect=5' \
>   -mon 'chardev=qmp-event,mode=control' \
>   -pidfile /var/run/qemu-server/100.pid \
>   -smbios 'type=1,uuid=25a3012b-e72a-4121-9e57-f6f5a40ebe62' \
>   -smp '4,sockets=1,cores=4,maxcpus=4' \
>   -nodefaults \
>   -vnc 'unix:/var/run/qemu-server/100.vnc,password=on' \
>   -cpu host,+kvm_pv_eoi,+kvm_pv_unhalt \
>   -m 4096 \
>   -object 'iothread,id=iothread-virtioscsi0' \
>   -device 'pci-bridge,id=pci.1,chassis_nr=1,bus=pci.0,addr=0x1e' \
>   -device 'pci-bridge,id=pci.2,chassis_nr=2,bus=pci.0,addr=0x1f' \
>   -device 'pci-bridge,id=pci.3,chassis_nr=3,bus=pci.0,addr=0x5' \
>   -device 'vmgenid,guid=ab6d50e3-d574-4ba8-bb0c-3c0bbcc9cd2e' \
>   -device 'piix3-usb-uhci,id=uhci,bus=pci.0,addr=0x1.0x2' \
>   -device 'VGA,id=vga,bus=pci.0,addr=0x2' \
>   -device
> 'virtio-scsi-pci,id=virtioscsi0,bus=pci.3,addr=0x1,iothread=iothread-virtioscsi0'
> \
>   -drive
> 'file=/var/lib/vz/images/100/vm-100-disk-0.qcow2,if=none,id=drive-scsi0,format=qcow2,cache=none,aio=io_uring,detect-zeroes=on'
> \
>   -device
> 'scsi-hd,bus=virtioscsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0,id=scsi0,bootindex=100'
> \
>   -machine 'type=pc'
> 
> [2]
> 
> [   85.282301] BUG: kernel NULL pointer dereference, address:
> 0000000000000008
> [   85.283454] #PF: supervisor write access in kernel mode
> [   85.284289] #PF: error_code(0x0002) - not-present page
> [   85.285087] PGD 0 P4D 0
> [   85.285500] Oops: 0002 [#1] PREEMPT SMP NOPTI
> [   85.286211] CPU: 0 PID: 794 Comm: kvm Tainted: G            E
> 6.9.0-debug2 #27
> [   85.287420] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> [   85.289177] RIP: 0010:blk_flush_complete_seq+0x291/0x2d0
> [   85.289994] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2
> 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73
> 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b
> [   85.292894] RSP: 0018:ffff9c08c07f3998 EFLAGS: 00010046
> [   85.293714] RAX: 0000000000000000 RBX: ffff8b9c8abad000 RCX:
> ffff8b9c8abad048
> [   85.294809] RDX: ffff8b9c80a3d6a0 RSI: ffff8b9c8abad048 RDI:
> 0000000000000000
> [   85.295921] RBP: ffff9c08c07f39d8 R08: 0000000000000000 R09:
> 0000000000000000
> [   85.297111] R10: 0000000000000000 R11: 0000000000000000 R12:
> 0000000000009801
> [   85.298211] R13: ffff8b9c80a3d680 R14: ffff8b9c80a3d690 R15:
> ffff8b9c8341ddc0
> [   85.299353] FS:  0000776683e006c0(0000) GS:ffff8b9db7a00000(0000)
> knlGS:0000000000000000
> [   85.300825] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   85.301712] CR2: 0000000000000008 CR3: 000000010e6e4001 CR4:
> 0000000000372ef0
> [   85.302810] Call Trace:
> [   85.303242]  <TASK>
> [   85.303597]  ? show_regs+0x6c/0x80
> [   85.304169]  ? __die+0x24/0x80
> [   85.304671]  ? page_fault_oops+0x175/0x5b0
> [   85.305326]  ? do_user_addr_fault+0x311/0x680
> [   85.306012]  ? exc_page_fault+0x82/0x1b0
> [   85.306657]  ? asm_exc_page_fault+0x27/0x30
> [   85.307297]  ? blk_flush_complete_seq+0x291/0x2d0
> [   85.308023]  ? __blk_mq_alloc_requests+0x251/0x540
> [   85.308785]  ? wbt_wait+0x33/0x100
> [   85.309620]  blk_insert_flush+0xce/0x220
> [   85.310307]  blk_mq_submit_bio+0x5cd/0x730
> [   85.310948]  __submit_bio+0xb3/0x1c0
> [   85.311535]  submit_bio_noacct_nocheck+0x2fc/0x3d0
> [   85.312296]  submit_bio_noacct+0x1ba/0x6b0
> [   85.312934]  ? ext4_file_write_iter+0x3b1/0x7e0
> [   85.313663]  submit_bio+0xb1/0x110
> [   85.314220]  md_super_write+0xcf/0x110
> [   85.314816]  write_sb_page+0x148/0x300
> [   85.315422]  filemap_write_page+0x5b/0x70
> [   85.316058]  md_bitmap_unplug+0x99/0x1f0
> [   85.316676]  flush_bio_list+0x107/0x110 [raid1]
> [   85.317399]  raid1_unplug+0x3c/0xf0 [raid1]
> [   85.318056]  __blk_flush_plug+0xbe/0x140
> [   85.318673]  blk_finish_plug+0x30/0x50
> [   85.319268]  io_submit_sqes+0x4c4/0x6c0
> [   85.319872]  __do_sys_io_uring_enter+0x2f2/0x640
> [   85.320590]  ? blk_mq_complete_request+0x26/0x40
> [   85.321326]  __x64_sys_io_uring_enter+0x22/0x40
> [   85.322044]  x64_sys_call+0x20b9/0x24b0
> [   85.322656]  do_syscall_64+0x80/0x170
> [   85.323243]  ? vring_interrupt+0x94/0x110
> [   85.323878]  ? __x64_sys_ppoll+0xf2/0x170
> [   85.324501]  ? handle_irq_event+0x52/0x80
> [   85.325103]  ? _raw_spin_unlock+0xe/0x40
> [   85.325728]  ? handle_edge_irq+0xda/0x250
> [   85.326347]  ? irqentry_exit_to_user_mode+0x76/0x270
> [   85.327114]  ? irqentry_exit+0x43/0x50
> [   85.327703]  ? clear_bhb_loop+0x15/0x70
> [   85.328286]  ? clear_bhb_loop+0x15/0x70
> [   85.328897]  ? clear_bhb_loop+0x15/0x70
> [   85.329541]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   85.330326] RIP: 0033:0x776695660b95
> [   85.330908] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87
> d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f
> 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24
> [   85.333736] RSP: 002b:0000776683dfafd8 EFLAGS: 00000246 ORIG_RAX:
> 00000000000001aa
> [   85.334911] RAX: ffffffffffffffda RBX: 000077667c0039f0 RCX:
> 0000776695660b95
> [   85.335999] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
> 0000000000000033
> [   85.337111] RBP: 000077667c0039f8 R08: 0000000000000000 R09:
> 0000000000000008
> [   85.338210] R10: 0000000000000000 R11: 0000000000000246 R12:
> 000077667c003ae0
> [   85.339304] R13: 0000000000000000 R14: 000064175d5afc68 R15:
> 000064178a3f9450
> [   85.340408]  </TASK>
> [   85.340772] Modules linked in: tcp_diag(E) inet_diag(E) veth(E)
> cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E)
> iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E)
> ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E)
> ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E)
> binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E)
> intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E)
> intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E)
> kvm_intel(E) kvm(E) hid_generic(E) crct10dif_pclmul(E)
> polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E)
> sha256_ssse3(E) sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E)
> cryptd(E) hid(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E)
> serio_raw(E) mac_hid(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E)
> efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E)
> autofs4(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E)
> async_pq(E) async_xor(E) async_tx(E)
> [   85.340859]  xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E)
> crc32_pclmul(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) psmouse(E)
> uhci_hcd(E) ehci_hcd(E) ttm(E) i2c_piix4(E) pata_acpi(E) floppy(E)
> [   85.357137] CR2: 0000000000000008
> [   85.358209] ---[ end trace 0000000000000000 ]---
> [   85.358926] RIP: 0010:blk_flush_complete_seq+0x291/0x2d0
> [   85.360004] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2
> 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73
> 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b
> [   85.362891] RSP: 0018:ffff9c08c07f3998 EFLAGS: 00010046
> [   85.363695] RAX: 0000000000000000 RBX: ffff8b9c8abad000 RCX:
> ffff8b9c8abad048
> [   85.364800] RDX: ffff8b9c80a3d6a0 RSI: ffff8b9c8abad048 RDI:
> 0000000000000000
> [   85.365902] RBP: ffff9c08c07f39d8 R08: 0000000000000000 R09:
> 0000000000000000
> [   85.366993] R10: 0000000000000000 R11: 0000000000000000 R12:
> 0000000000009801
> [   85.368089] R13: ffff8b9c80a3d680 R14: ffff8b9c80a3d690 R15:
> ffff8b9c8341ddc0
> [   85.369190] FS:  0000776683e006c0(0000) GS:ffff8b9db7a00000(0000)
> knlGS:0000000000000000
> [   85.370440] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   85.371327] CR2: 0000000000000008 CR3: 000000010e6e4001 CR4:
> 0000000000372ef0
> [   85.372440] note: kvm[794] exited with irqs disabled
> [   85.373340] note: kvm[794] exited with preempt_count 1
> [   85.374238] ------------[ cut here ]------------
> [   85.374943] WARNING: CPU: 0 PID: 794 at kernel/exit.c:827
> do_exit+0x8a2/0xab0
> [   85.376047] Modules linked in: tcp_diag(E) inet_diag(E) veth(E)
> cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E)
> iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E)
> ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E)
> ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E)
> binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E)
> intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E)
> intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E)
> kvm_intel(E) kvm(E) hid_generic(E) crct10dif_pclmul(E)
> polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E)
> sha256_ssse3(E) sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E)
> cryptd(E) hid(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E)
> serio_raw(E) mac_hid(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E)
> efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E)
> autofs4(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E)
> async_pq(E) async_xor(E) async_tx(E)
> [   85.376128]  xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E)
> crc32_pclmul(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) psmouse(E)
> uhci_hcd(E) ehci_hcd(E) ttm(E) i2c_piix4(E) pata_acpi(E) floppy(E)
> [   85.392395] CPU: 0 PID: 794 Comm: kvm Tainted: G      D     E
> 6.9.0-debug2 #27
> [   85.393594] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> [   85.395347] RIP: 0010:do_exit+0x8a2/0xab0
> [   85.395975] Code: 38 0c 00 00 e9 2d fe ff ff 49 8d 7d 18 e8 b6 83 07
> 00 e9 5a f8 ff ff 48 8b bb f8 09 00 00 31 f6 e8 b3 e0 ff ff e9 c1 fd ff
> ff <0f> 0b e9 d4 f7 ff ff 0f 0b e9 8f f7 ff ff 4c 89 e6 bf 05 06 00 00
> [   85.398793] RSP: 0018:ffff9c08c07f3ec8 EFLAGS: 00010286
> [   85.399625] RAX: 0000000000000000 RBX: ffff8b9c82e90000 RCX:
> 0000000000000000
> [   85.400720] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
> 0000000000000000
> 
> [3]
> 
> 00000000000000c0 <blk_flush_complete_seq>:
> {
> [...]
>        struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
>  32a:   49 8d 56 01             lea    0x1(%r14),%rdx
>  32e:   49 c1 e6 04             shl    $0x4,%r14
>  332:   4d 01 ee                add    %r13,%r14
>  335:   48 c1 e2 04             shl    $0x4,%rdx
>         return READ_ONCE(head->next) == head;
>  339:   49 8b 4e 10             mov    0x10(%r14),%rcx
>  33d:   4c 01 ea                add    %r13,%rdx
>                 if (list_empty(pending))
>  340:   48 39 ca                cmp    %rcx,%rdx
>  343:   74 2b                   je     370 <blk_flush_complete_seq+0x2b0>
>         __list_del(entry->prev, entry->next);
>  345:   48 8b 4b 50             mov    0x50(%rbx),%rcx
>  349:   48 8b 7b 48             mov    0x48(%rbx),%rdi
>                 list_move_tail(&rq->queuelist, pending);
>  34d:   48 8d 73 48             lea    0x48(%rbx),%rsi
>         next->prev = prev;
>  351:   48 89 4f 08             mov    %rcx,0x8(%rdi)
>         WRITE_ONCE(prev->next, next);
>  355:   48 89 39                mov    %rdi,(%rcx)
>         __list_add(new, head->prev, head);
>  358:   49 8b 4e 18             mov    0x18(%r14),%rcx
>         next->prev = new;
>  35c:   49 89 76 18             mov    %rsi,0x18(%r14)
>         new->next = next;
>  360:   48 89 53 48             mov    %rdx,0x48(%rbx)
>         new->prev = prev;
>  364:   48 89 4b 50             mov    %rcx,0x50(%rbx)
>         WRITE_ONCE(prev->next, new);
>  368:   48 89 31                mov    %rsi,(%rcx)
> }
>  36b:   e9 30 fe ff ff          jmp    1a0 <blk_flush_complete_seq+0xe0>
>                         fq->flush_pending_since = jiffies;
>  370:   48 8b 0d 00 00 00 00    mov    0x0(%rip),%rcx        # 377
> <blk_flush_complete_seq+0x2b7>
>  377:   49 89 4d 08             mov    %rcx,0x8(%r13)
>  37b:   eb c8                   jmp    345 <blk_flush_complete_seq+0x285>
> 
> [4] May 27 11:12:53 reproflushfull kernel: list_del corruption.
> prev->next should be ffff8ad44ab63788, but was ffff8ad44ab63400.
> (prev=ffff8ad44ab63788)
> May 27 11:12:53 reproflushfull kernel: WARNING: CPU: 0 PID: 797 at
> lib/list_debug.c:62 __list_del_entry_valid_or_report+0xd4/0x100
> May 27 11:12:53 reproflushfull kernel: Modules linked in: veth(E)
> cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E)
> iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E)
> ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E)
> ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E)
> binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E)
> intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E)
> intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E)
> kvm_intel(E) kvm(E) crct10dif_pclmul(E) polyval_clmulni(E)
> polyval_generic(E) hid_generic(E) ghash_clmulni_intel(E) sha256_ssse3(E)
> sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E) cryptd(E) hid(E)
> rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E)
> serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E)
> dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E)
> raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E)
> async_xor(E) async_tx(E) xor(E) raid6_pq(E)
> May 27 11:12:53 reproflushfull kernel:  libcrc32c(E) raid0(E) raid1(E)
> crc32_pclmul(E) bochs(E) psmouse(E) drm_vram_helper(E) drm_ttm_helper(E)
> uhci_hcd(E) ttm(E) i2c_piix4(E) ehci_hcd(E) pata_acpi(E) floppy(E)
> May 27 11:12:53 reproflushfull kernel: CPU: 0 PID: 797 Comm: kvm
> Tainted: G            E      6.9.0-debug #25
> May 27 11:12:53 reproflushfull kernel: Hardware name: QEMU Standard PC
> (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org
> 04/01/2014
> May 27 11:12:53 reproflushfull kernel: RIP:
> 0010:__list_del_entry_valid_or_report+0xd4/0x100
> May 27 11:12:53 reproflushfull kernel: Code: 48 89 fe 48 89 ca 48 c7 c7
> 78 77 9f a1 e8 84 3a 8e ff 0f 0b 31 c0 eb b0 48 89 fe 48 89 c2 48 c7 c7
> b0 77 9f a1 e8 6c 3a 8e ff <0f> 0b 31 c0 eb 98 48 89 d1 48 c7 c7 f8 77
> 9f a1 48 89 f2 48 89 c6
> May 27 11:12:53 reproflushfull kernel: RSP: 0018:ffffb1dd80673740
> EFLAGS: 00010046
> May 27 11:12:53 reproflushfull kernel: RAX: 0000000000000000 RBX:
> ffff8ad440be12c0 RCX: 0000000000000000
> May 27 11:12:53 reproflushfull kernel: RDX: 0000000000000000 RSI:
> 0000000000000000 RDI: 0000000000000000
> May 27 11:12:53 reproflushfull kernel: RBP: ffffb1dd80673740 R08:
> 0000000000000000 R09: 0000000000000000
> May 27 11:12:53 reproflushfull kernel: R10: 0000000000000000 R11:
> 0000000000000000 R12: ffff8ad44a8c1e00
> May 27 11:12:53 reproflushfull kernel: R13: ffff8ad440be12c0 R14:
> ffff8ad44ab63788 R15: ffff8ad44ab63740
> May 27 11:12:53 reproflushfull kernel: FS:  00007e8a67c006c0(0000)
> GS:ffff8ad577a00000(0000) knlGS:0000000000000000
> May 27 11:12:53 reproflushfull kernel: CS:  0010 DS: 0000 ES: 0000 CR0:
> 0000000080050033
> May 27 11:12:53 reproflushfull kernel: CR2: 00007e8a5801a008 CR3:
> 000000010e022001 CR4: 0000000000372ef0
> May 27 11:12:53 reproflushfull kernel: Call Trace:
> May 27 11:12:53 reproflushfull kernel:  <TASK>
> May 27 11:12:53 reproflushfull kernel:  ? show_regs+0x6c/0x80
> May 27 11:12:53 reproflushfull kernel:  ? __warn+0x88/0x140
> May 27 11:12:53 reproflushfull kernel:  ?
> __list_del_entry_valid_or_report+0xd4/0x100
> May 27 11:12:53 reproflushfull kernel:  ? report_bug+0x182/0x1b0
> May 27 11:12:53 reproflushfull kernel:  ? handle_bug+0x46/0x90
> May 27 11:12:53 reproflushfull kernel:  ? exc_invalid_op+0x18/0x80
> May 27 11:12:53 reproflushfull kernel:  ? asm_exc_invalid_op+0x1b/0x20
> May 27 11:12:53 reproflushfull kernel:  ?
> __list_del_entry_valid_or_report+0xd4/0x100
> May 27 11:12:53 reproflushfull kernel:  blk_flush_complete_seq+0x2f3/0x360
> May 27 11:12:53 reproflushfull kernel:  ? wbt_wait+0x33/0x100
> May 27 11:12:53 reproflushfull kernel:  blk_insert_flush+0xce/0x220
> May 27 11:12:53 reproflushfull kernel:  blk_mq_submit_bio+0x5cd/0x730
> May 27 11:12:53 reproflushfull kernel:  __submit_bio+0xb3/0x1c0
> May 27 11:12:53 reproflushfull kernel:
> submit_bio_noacct_nocheck+0x2fc/0x3d0
> May 27 11:12:53 reproflushfull kernel:  submit_bio_noacct+0x1ba/0x6b0
> May 27 11:12:53 reproflushfull kernel:  submit_bio+0xb1/0x110
> May 27 11:12:53 reproflushfull kernel:  md_super_write+0xcf/0x110
> May 27 11:12:53 reproflushfull kernel:  write_sb_page+0x148/0x300
> May 27 11:12:53 reproflushfull kernel:  filemap_write_page+0x5b/0x70
> May 27 11:12:53 reproflushfull kernel:  md_bitmap_unplug+0x99/0x1f0
> May 27 11:12:53 reproflushfull kernel:  flush_bio_list+0x107/0x110 [raid1]
> May 27 11:12:53 reproflushfull kernel:  raid1_unplug+0x3c/0xf0 [raid1]
> May 27 11:12:53 reproflushfull kernel:  __blk_flush_plug+0xd9/0x170
> May 27 11:12:53 reproflushfull kernel:  blk_finish_plug+0x30/0x50
> May 27 11:12:53 reproflushfull kernel:  io_submit_sqes+0x4c4/0x6c0
> May 27 11:12:53 reproflushfull kernel:  __do_sys_io_uring_enter+0x2f2/0x640
> May 27 11:12:53 reproflushfull kernel:  __x64_sys_io_uring_enter+0x22/0x40
> May 27 11:12:53 reproflushfull kernel:  x64_sys_call+0x20b9/0x24b0
> May 27 11:12:53 reproflushfull kernel:  do_syscall_64+0x80/0x170
> May 27 11:12:53 reproflushfull kernel:  ? do_mprotect_pkey+0x198/0x620
> May 27 11:12:53 reproflushfull kernel:  ?
> __memcg_slab_post_alloc_hook+0x18e/0x230
> May 27 11:12:53 reproflushfull kernel:  ? policy_nodemask+0x145/0x180
> May 27 11:12:53 reproflushfull kernel:  ?
> __mod_memcg_lruvec_state+0x87/0x120
> May 27 11:12:53 reproflushfull kernel:  ? __mod_lruvec_state+0x36/0x50
> May 27 11:12:53 reproflushfull kernel:  ? __lruvec_stat_mod_folio+0x70/0xc0
> May 27 11:12:53 reproflushfull kernel:  ? set_ptes.constprop.0+0x2b/0xb0
> May 27 11:12:53 reproflushfull kernel:  ? _raw_spin_unlock+0xe/0x40
> May 27 11:12:53 reproflushfull kernel:  ? do_anonymous_page+0x23d/0x790
> May 27 11:12:53 reproflushfull kernel:  ? __pte_offset_map+0x1c/0x1b0
> May 27 11:12:53 reproflushfull kernel:  ? __handle_mm_fault+0xc1a/0xe90
> May 27 11:12:53 reproflushfull kernel:  ? do_syscall_64+0x8c/0x170
> May 27 11:12:53 reproflushfull kernel:  ? __count_memcg_events+0x6f/0xe0
> May 27 11:12:53 reproflushfull kernel:  ?
> count_memcg_events.constprop.0+0x2a/0x50
> May 27 11:12:53 reproflushfull kernel:  ? handle_mm_fault+0xaf/0x340
> May 27 11:12:53 reproflushfull kernel:  ? do_user_addr_fault+0x365/0x680
> May 27 11:12:53 reproflushfull kernel:  ?
> irqentry_exit_to_user_mode+0x76/0x270
> May 27 11:12:53 reproflushfull kernel:  ? irqentry_exit+0x43/0x50
> May 27 11:12:53 reproflushfull kernel:  ? clear_bhb_loop+0x15/0x70
> May 27 11:12:53 reproflushfull kernel:  ? clear_bhb_loop+0x15/0x70
> May 27 11:12:53 reproflushfull kernel:  ? clear_bhb_loop+0x15/0x70
> May 27 11:12:53 reproflushfull kernel:
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
> May 27 11:12:53 reproflushfull kernel: RIP: 0033:0x7e8a75069b95
> May 27 11:12:53 reproflushfull kernel: Code: 00 00 00 44 89 d0 41 b9 08
> 00 00 00 83 c8 10 f6 87 d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45
> 31 c0 b8 aa 01 00 00 0f 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2
> 02 74 c2 f0 48 83 0c 24
> May 27 11:12:53 reproflushfull kernel: RSP: 002b:00007e8a67bfafd8
> EFLAGS: 00000246 ORIG_RAX: 00000000000001aa
> May 27 11:12:53 reproflushfull kernel: RAX: ffffffffffffffda RBX:
> 00007e8a580039f0 RCX: 00007e8a75069b95
> May 27 11:12:53 reproflushfull kernel: RDX: 0000000000000000 RSI:
> 0000000000000020 RDI: 0000000000000033
> May 27 11:12:53 reproflushfull kernel: RBP: 00007e8a580039f8 R08:
> 0000000000000000 R09: 0000000000000008
> May 27 11:12:53 reproflushfull kernel: R10: 0000000000000000 R11:
> 0000000000000246 R12: 00007e8a58003ae0
> May 27 11:12:53 reproflushfull kernel: R13: 0000000000000000 R14:
> 000064a145ca1c68 R15: 000064a183935450
> May 27 11:12:53 reproflushfull kernel:  </TASK>
> May 27 11:12:53 reproflushfull kernel: ---[ end trace 0000000000000000 ]---
>
Chengming Zhou May 27, 2024, 11:50 p.m. UTC | #5
On 2024/5/28 07:34, Chengming Zhou wrote:
> On 2024/5/28 00:04, Friedrich Weber wrote:
>> Hi Chengming,
>>
>> Thank you for taking a look at this!
>>
>> On 27/05/2024 07:09, Chengming Zhou wrote:
>>>> I've used this reproducer for a bisect, which produced
>>>>
>>>>  81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine)
>>>>
>>>> as the first commit with which I can reproduce the crashes. I'm not 100%
>>>> sure it is this one because the reproducer is a bit flaky. But it does
>>>> sound plausible, as the commit is included in our 6.8 kernel, and
>>>> touches `queuelist` which is AFAICT where blk_flush_complete_seq
>>>> dereferences the NULL pointer.
>>>
>>> Ok, it will be better that I can reproduce it locally, will try later.
>>
>> Interestingly, so far I haven't been able to reproduce the crash when
>> generating IO on the host itself, I only got crashes when generating IO
>> in a QEMU VM.
>>
>> The reproducer in more detail:
> 
> Thanks for these details, I will try to setup and reproduce when I back to work.
> 
>>
>> - Compile Linux 6.9 with CONFIG_FAULT_INJECTION,
> [...]
>>>
>>> BUG shows it panic on 0000000000000008, not sure what it's accessing then,
>>> does it means rq->queuelist.next == 0 or something? Could you use add2line
>>> to show the exact source code line that panic? I use blk_flush_complete_seq+0x296/0x2e0
>>> and get block/blk-flush.c:190, which is "fq->flush_data_in_flight++;",
>>> obviously fq can't be NULL. (I'm using the v6.9 kernel)
>>
>> Sorry for the confusion, the crash dump was from a kernel compiled at
>> 81ada09cc25e -- with 6.9, the offset seems to be different. See [2] for
>> a kernel 6.9 crash dump.
>>
>> I don't know too much about kernel debugging, but I tried to get
>> something useful out of addr2line:
>>
>> # addr2line -f -e /usr/lib/debug/vmlinux-6.9.0-debug2
>> blk_flush_complete_seq+0x291/0x2d0
>> __list_del
>> /[...]./include/linux/list.h:195
>>
>> I tried to find the relevant portions in `objdump -SD blk-flush.o`, see
>> [3]. If I'm not mistaken, blk_flush_complete_seq+0x291 should point to
>>
>> 351:   48 89 4f 08             mov    %rcx,0x8(%rdi)
>>
>> To me this looks like part of
>>
>> 	list_move_tail(&rq->queuelist, pending);
>>
>> What do you think?
> 
> Yeah, it seems correct, so the rq->queuelist.next == NULL. It can't be NULL
> if went through REQ_FSEQ_POSTFLUSH, so it must be REQ_FSEQ_PREFLUSH. It means
> we allocated a request but its queuelist is not initialized or corrupted?
> 
> Anyway, I will use below changes for debugging when reproduce, and you could
> also try this to see if we could get something useful. :)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 3b4df8e5ac9e..6e3a6cd7739d 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2989,6 +2989,8 @@ void blk_mq_submit_bio(struct bio *bio)
>                 blk_mq_use_cached_rq(rq, plug, bio);
>         }
> 
> +       BUG_ON(rq->queuelist.next == NULL);
> +
>         trace_block_getrq(bio);
> 
>         rq_qos_track(q, rq, bio);
> @@ -3006,6 +3008,8 @@ void blk_mq_submit_bio(struct bio *bio)
>         if (bio_zone_write_plugging(bio))
>                 blk_zone_write_plug_init_request(rq);
> 
> +       BUG_ON(rq->queuelist.next == NULL);
> +
>         if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
>                 return;
> 

Ah, I forgot to change to your kernel version, then should be:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d98654869615..908fdfb62132 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2963,6 +2963,8 @@ void blk_mq_submit_bio(struct bio *bio)
                        return;
        }

+       BUG_ON(rq->queuelist.next == NULL);
+
        trace_block_getrq(bio);

        rq_qos_track(q, rq, bio);
@@ -2977,6 +2979,8 @@ void blk_mq_submit_bio(struct bio *bio)
                return;
        }

+       BUG_ON(rq->queuelist.next == NULL);
+
        if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
                return;
Chengming Zhou May 28, 2024, 12:12 a.m. UTC | #6
On 2024/5/28 07:50, Chengming Zhou wrote:
> On 2024/5/28 07:34, Chengming Zhou wrote:
>> On 2024/5/28 00:04, Friedrich Weber wrote:
>>> Hi Chengming,
>>>
>>> Thank you for taking a look at this!
>>>
>>> On 27/05/2024 07:09, Chengming Zhou wrote:
>>>>> I've used this reproducer for a bisect, which produced
>>>>>
>>>>>  81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine)
>>>>>
>>>>> as the first commit with which I can reproduce the crashes. I'm not 100%
>>>>> sure it is this one because the reproducer is a bit flaky. But it does
>>>>> sound plausible, as the commit is included in our 6.8 kernel, and
>>>>> touches `queuelist` which is AFAICT where blk_flush_complete_seq
>>>>> dereferences the NULL pointer.
>>>>
>>>> Ok, it will be better that I can reproduce it locally, will try later.
>>>
>>> Interestingly, so far I haven't been able to reproduce the crash when
>>> generating IO on the host itself, I only got crashes when generating IO
>>> in a QEMU VM.
>>>
>>> The reproducer in more detail:
>>
>> Thanks for these details, I will try to setup and reproduce when I back to work.
>>
>>>
>>> - Compile Linux 6.9 with CONFIG_FAULT_INJECTION,
>> [...]
>>>>
>>>> BUG shows it panic on 0000000000000008, not sure what it's accessing then,
>>>> does it means rq->queuelist.next == 0 or something? Could you use add2line
>>>> to show the exact source code line that panic? I use blk_flush_complete_seq+0x296/0x2e0
>>>> and get block/blk-flush.c:190, which is "fq->flush_data_in_flight++;",
>>>> obviously fq can't be NULL. (I'm using the v6.9 kernel)
>>>
>>> Sorry for the confusion, the crash dump was from a kernel compiled at
>>> 81ada09cc25e -- with 6.9, the offset seems to be different. See [2] for
>>> a kernel 6.9 crash dump.
>>>
>>> I don't know too much about kernel debugging, but I tried to get
>>> something useful out of addr2line:
>>>
>>> # addr2line -f -e /usr/lib/debug/vmlinux-6.9.0-debug2
>>> blk_flush_complete_seq+0x291/0x2d0
>>> __list_del
>>> /[...]./include/linux/list.h:195
>>>
>>> I tried to find the relevant portions in `objdump -SD blk-flush.o`, see
>>> [3]. If I'm not mistaken, blk_flush_complete_seq+0x291 should point to
>>>
>>> 351:   48 89 4f 08             mov    %rcx,0x8(%rdi)
>>>
>>> To me this looks like part of
>>>
>>> 	list_move_tail(&rq->queuelist, pending);
>>>
>>> What do you think?
>>
>> Yeah, it seems correct, so the rq->queuelist.next == NULL. It can't be NULL
>> if went through REQ_FSEQ_POSTFLUSH, so it must be REQ_FSEQ_PREFLUSH. It means
>> we allocated a request but its queuelist is not initialized or corrupted?
>>
>> Anyway, I will use below changes for debugging when reproduce, and you could
>> also try this to see if we could get something useful. :)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 3b4df8e5ac9e..6e3a6cd7739d 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -2989,6 +2989,8 @@ void blk_mq_submit_bio(struct bio *bio)
>>                 blk_mq_use_cached_rq(rq, plug, bio);
>>         }
>>
>> +       BUG_ON(rq->queuelist.next == NULL);
>> +
>>         trace_block_getrq(bio);
>>
>>         rq_qos_track(q, rq, bio);
>> @@ -3006,6 +3008,8 @@ void blk_mq_submit_bio(struct bio *bio)
>>         if (bio_zone_write_plugging(bio))
>>                 blk_zone_write_plug_init_request(rq);
>>
>> +       BUG_ON(rq->queuelist.next == NULL);
>> +
>>         if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
>>                 return;
>>
> 
> Ah, I forgot to change to your kernel version, then should be:
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d98654869615..908fdfb62132 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2963,6 +2963,8 @@ void blk_mq_submit_bio(struct bio *bio)
>                         return;
>         }
> 
> +       BUG_ON(rq->queuelist.next == NULL);
> +
>         trace_block_getrq(bio);
> 
>         rq_qos_track(q, rq, bio);
> @@ -2977,6 +2979,8 @@ void blk_mq_submit_bio(struct bio *bio)
>                 return;
>         }
> 
> +       BUG_ON(rq->queuelist.next == NULL);
> +
>         if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
>                 return;
> 

Another possibility is that drivers may change rq->queuelist even after
rq->end_io(). So add two more BUG_ON() to detect this:

diff --git a/block/blk-flush.c b/block/blk-flush.c
index e73dc22d05c1..0eb684a468e5 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -179,7 +179,10 @@ static void blk_flush_complete_seq(struct request *rq,

        switch (seq) {
        case REQ_FSEQ_PREFLUSH:
+               BUG_ON(rq->queuelist.next == NULL);
+               fallthrough;
        case REQ_FSEQ_POSTFLUSH:
+               BUG_ON(rq->queuelist.next == NULL);
                /* queue for flush */
                if (list_empty(pending))
                        fq->flush_pending_since = jiffies;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d98654869615..908fdfb62132 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2963,6 +2963,8 @@ void blk_mq_submit_bio(struct bio *bio)
                        return;
        }

+       BUG_ON(rq->queuelist.next == NULL);
+
        trace_block_getrq(bio);

        rq_qos_track(q, rq, bio);
@@ -2977,6 +2979,8 @@ void blk_mq_submit_bio(struct bio *bio)
                return;
        }

+       BUG_ON(rq->queuelist.next == NULL);
+
        if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
                return;
Friedrich Weber May 28, 2024, 8:42 a.m. UTC | #7
Hi Chengming,

Thanks again!

On 28/05/2024 02:12, Chengming Zhou wrote:
> Another possibility is that drivers may change rq->queuelist even after
> rq->end_io(). So add two more BUG_ON() to detect this:
> 
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index e73dc22d05c1..0eb684a468e5 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -179,7 +179,10 @@ static void blk_flush_complete_seq(struct request *rq,
> 
>         switch (seq) {
>         case REQ_FSEQ_PREFLUSH:
> +               BUG_ON(rq->queuelist.next == NULL);
> +               fallthrough;
>         case REQ_FSEQ_POSTFLUSH:
> +               BUG_ON(rq->queuelist.next == NULL);
>                 /* queue for flush */
>                 if (list_empty(pending))
>                         fq->flush_pending_since = jiffies;
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d98654869615..908fdfb62132 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2963,6 +2963,8 @@ void blk_mq_submit_bio(struct bio *bio)
>                         return;
>         }
> 
> +       BUG_ON(rq->queuelist.next == NULL);
> +
>         trace_block_getrq(bio);
> 
>         rq_qos_track(q, rq, bio);
> @@ -2977,6 +2979,8 @@ void blk_mq_submit_bio(struct bio *bio)
>                 return;
>         }
> 
> +       BUG_ON(rq->queuelist.next == NULL);
> +
>         if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
>                 return;
> 

I applied the following patch (with slightly different line numbers) to
my checkout of Linux 6.9:

diff --git a/block/blk-flush.c b/block/blk-flush.c
index b0f314f4bc14..07e0de3b28ef 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -179,7 +179,10 @@ static void blk_flush_complete_seq(struct request *rq,

        switch (seq) {
        case REQ_FSEQ_PREFLUSH:
+               BUG_ON(rq->queuelist.next == NULL);
+               fallthrough;
        case REQ_FSEQ_POSTFLUSH:
+               BUG_ON(rq->queuelist.next == NULL);
                /* queue for flush */
                if (list_empty(pending))
                        fq->flush_pending_since = jiffies;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 32afb87efbd0..13a8f4dd1e5b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2988,6 +2988,8 @@ void blk_mq_submit_bio(struct bio *bio)
                blk_mq_use_cached_rq(rq, plug, bio);
        }

+       BUG_ON(rq->queuelist.next == NULL);
+
        trace_block_getrq(bio);

        rq_qos_track(q, rq, bio);
@@ -3002,6 +3004,8 @@ void blk_mq_submit_bio(struct bio *bio)
                return;
        }

+       BUG_ON(rq->queuelist.next == NULL);
+
        if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
                return;

Hope I did this correctly. With this, the reproducer triggered a BUG
pretty quickly, see [0]. If I can provide anything else, just let me know.

I forgot to mention earlier that I'm running the reproducer against a
QEMU/KVM guest too, for convenience -- so, the "host" from my previous
post is in fact running virtualized. I don't think it should be relevant
here, but I'll try to run the reproducer against real hardware too.

Best,

Friedrich

[0]

[   37.543834] kernel BUG at block/blk-mq.c:2991!
[   37.544458] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
[   37.545077] CPU: 4 PID: 795 Comm: kvm Tainted: G            E
6.9.0-troubleshoot1-dirty #28
[   37.546087] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[   37.547389] RIP: 0010:blk_mq_submit_bio+0x6f5/0x750
[   37.547949] Code: ff e9 aa fc ff ff e8 7a 3c a4 ff 48 89 de 4c 89 e7
e8 9f ef ff ff e8 da 79 a4 ff e9 99 fc ff ff e8 d0 45 a7 ff e9 cd fa ff
ff <0f> 0b 0f 0b 48 8b 03 48 8b 40 68 48 8b 40 40 e9 8c fb ff ff f6 43
[   37.550088] RSP: 0018:ffffa2b480863a70 EFLAGS: 00010246
[   37.550685] RAX: 0000000000000000 RBX: ffff971c0b29d000 RCX:
0000000000000000
[   37.551502] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
0000000000000000
[   37.552325] RBP: ffffa2b480863ae8 R08: 0000000000000000 R09:
0000000000000000
[   37.553130] R10: 0000000000000000 R11: 0000000000000000 R12:
ffff971c106ee600
[   37.553934] R13: ffff971c0fb328f0 R14: ffff971c03b330c0 R15:
0000000000000001
[   37.554747] FS:  00007bcc47e006c0(0000) GS:ffff971d37c00000(0000)
knlGS:0000000000000000
[   37.555663] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   37.556311] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4:
0000000000372ef0
[   37.557117] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[   37.557935] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
0000000000000400
[   37.558789] Call Trace:
[   37.559080]  <TASK>
[   37.559355]  ? show_regs+0x6c/0x80
[   37.560276]  ? die+0x37/0xa0
[   37.560633]  ? do_trap+0xda/0xf0
[   37.561023]  ? do_error_trap+0x71/0xb0
[   37.561486]  ? blk_mq_submit_bio+0x6f5/0x750
[   37.562008]  ? exc_invalid_op+0x52/0x80
[   37.562463]  ? blk_mq_submit_bio+0x6f5/0x750
[   37.562987]  ? asm_exc_invalid_op+0x1b/0x20
[   37.563492]  ? blk_mq_submit_bio+0x6f5/0x750
[   37.564007]  __submit_bio+0xb3/0x1c0
[   37.564439]  submit_bio_noacct_nocheck+0x2fc/0x3d0
[   37.565019]  submit_bio_noacct+0x1ba/0x6b0
[   37.565508]  flush_bio_list+0x71/0x110 [raid1]
[   37.566049]  raid1_unplug+0x3c/0xf0 [raid1]
[   37.566539]  __blk_flush_plug+0xbe/0x140
[   37.567016]  blk_finish_plug+0x30/0x50
[   37.567466]  io_submit_sqes+0x4c4/0x6c0
[   37.567942]  ? syscall_exit_to_user_mode+0x81/0x270
[   37.568530]  __do_sys_io_uring_enter+0x2f2/0x640
[   37.569070]  __x64_sys_io_uring_enter+0x22/0x40
[   37.569626]  x64_sys_call+0x20b9/0x24b0
[   37.570071]  do_syscall_64+0x80/0x170
[   37.570525]  ? vfs_read+0x240/0x380
[   37.570937]  ? ksys_read+0xe6/0x100
[   37.571360]  ? syscall_exit_to_user_mode+0x81/0x270
[   37.571930]  ? do_syscall_64+0x8c/0x170
[   37.572402]  ? syscall_exit_to_user_mode+0x81/0x270
[   37.572969]  ? do_syscall_64+0x8c/0x170
[   37.573438]  ? irqentry_exit+0x43/0x50
[   37.573878]  ? clear_bhb_loop+0x15/0x70
[   37.574348]  ? clear_bhb_loop+0x15/0x70
[   37.574806]  ? clear_bhb_loop+0x15/0x70
[   37.575282]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   37.575891] RIP: 0033:0x7bcc58f45b95
[   37.576335] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87
d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f
05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24
[   37.578468] RSP: 002b:00007bcc47dfafd8 EFLAGS: 00000246 ORIG_RAX:
00000000000001aa
[   37.579359] RAX: ffffffffffffffda RBX: 00007bcc400039d0 RCX:
00007bcc58f45b95
[   37.580198] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
0000000000000033
[   37.581028] RBP: 00007bcc400039d8 R08: 0000000000000000 R09:
0000000000000008
[   37.581864] R10: 0000000000000000 R11: 0000000000000246 R12:
00007bcc40003ac0
[   37.582695] R13: 0000000000000000 R14: 0000591ad05f5c68 R15:
0000591b10198450
[   37.583534]  </TASK>
[   37.583812] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E)
cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E)
cifs_md4(E) netfs(E) ebtable_filter(E) ebtables(E) ip_set(E)
ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E)
iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) binfmt_misc(E)
bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E)
intel_rapl_common(E) intel_uncore_frequency_common(E) intel_pmc_core(E)
intel_vsec(E) pmt_telemetry(E) pmt_class(E) kvm_intel(E) kvm(E)
crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E)
ghash_clmulni_intel(E) sha256_ssse3(E) sha1_ssse3(E) hid_generic(E)
aesni_intel(E) crypto_simd(E) cryptd(E) usbhid(E) hid(E) rapl(E)
pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) serio_raw(E)
vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E)
qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) raid10(E) raid456(E)
async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E)
async_tx(E) xor(E) raid6_pq(E)
[   37.583875]  libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E)
psmouse(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) uhci_hcd(E)
ttm(E) ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E)
[   37.595967] ---[ end trace 0000000000000000 ]---
[   37.596555] RIP: 0010:blk_mq_submit_bio+0x6f5/0x750
[   37.597144] Code: ff e9 aa fc ff ff e8 7a 3c a4 ff 48 89 de 4c 89 e7
e8 9f ef ff ff e8 da 79 a4 ff e9 99 fc ff ff e8 d0 45 a7 ff e9 cd fa ff
ff <0f> 0b 0f 0b 48 8b 03 48 8b 40 68 48 8b 40 40 e9 8c fb ff ff f6 43
[   37.599320] RSP: 0018:ffffa2b480863a70 EFLAGS: 00010246
[   37.599932] RAX: 0000000000000000 RBX: ffff971c0b29d000 RCX:
0000000000000000
[   37.600757] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
0000000000000000
[   37.601600] RBP: ffffa2b480863ae8 R08: 0000000000000000 R09:
0000000000000000
[   37.602450] R10: 0000000000000000 R11: 0000000000000000 R12:
ffff971c106ee600
[   37.603299] R13: ffff971c0fb328f0 R14: ffff971c03b330c0 R15:
0000000000000001
[   37.604129] FS:  00007bcc47e006c0(0000) GS:ffff971d37c00000(0000)
knlGS:0000000000000000
[   37.605079] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   37.605774] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4:
0000000000372ef0
[   37.606620] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[   37.607464] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
0000000000000400
[   37.608335] ------------[ cut here ]------------
[   37.608885] WARNING: CPU: 4 PID: 795 at kernel/exit.c:827
do_exit+0x8a2/0xab0
[   37.609754] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E)
cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E)
cifs_md4(E) netfs(E) ebtable_filter(E) ebtables(E) ip_set(E)
ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E)
iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) binfmt_misc(E)
bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E)
intel_rapl_common(E) intel_uncore_frequency_common(E) intel_pmc_core(E)
intel_vsec(E) pmt_telemetry(E) pmt_class(E) kvm_intel(E) kvm(E)
crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E)
ghash_clmulni_intel(E) sha256_ssse3(E) sha1_ssse3(E) hid_generic(E)
aesni_intel(E) crypto_simd(E) cryptd(E) usbhid(E) hid(E) rapl(E)
pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) serio_raw(E)
vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E)
qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) raid10(E) raid456(E)
async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E)
async_tx(E) xor(E) raid6_pq(E)
[   37.609827]  libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E)
psmouse(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) uhci_hcd(E)
ttm(E) ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E)
[   37.622013] CPU: 4 PID: 795 Comm: kvm Tainted: G      D     E
6.9.0-troubleshoot1-dirty #28
[   37.623045] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[   37.624382] RIP: 0010:do_exit+0x8a2/0xab0
[   37.624869] Code: 38 0c 00 00 e9 2d fe ff ff 49 8d 7d 18 e8 b6 83 07
00 e9 5a f8 ff ff 48 8b bb f8 09 00 00 31 f6 e8 b3 e0 ff ff e9 c1 fd ff
ff <0f> 0b e9 d4 f7 ff ff 0f 0b e9 8f f7 ff ff 4c 89 e6 bf 05 06 00 00
[   37.627117] RSP: 0018:ffffa2b480863ec8 EFLAGS: 00010286
[   37.627780] RAX: 0000000000000000 RBX: ffff971c05458000 RCX:
0000000000000000
[   37.628638] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
0000000000000000
[   37.629512] RBP: ffffa2b480863f20 R08: 0000000000000000 R09:
0000000000000000
[   37.630379] R10: 0000000000000000 R11: 0000000000000000 R12:
000000000000000b
[   37.631231] R13: ffff971c04c8d100 R14: ffff971c00c28000 R15:
ffffa2b4808639c8
[   37.632095] FS:  00007bcc47e006c0(0000) GS:ffff971d37c00000(0000)
knlGS:0000000000000000
[   37.633090] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   37.633794] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4:
0000000000372ef0
[   37.634666] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[   37.635533] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
0000000000000400
[   37.636387] Call Trace:
[   37.636692]  <TASK>
[   37.636963]  ? show_regs+0x6c/0x80
[   37.637393]  ? __warn+0x88/0x140
[   37.637801]  ? do_exit+0x8a2/0xab0
[   37.638197]  ? report_bug+0x182/0x1b0
[   37.638674]  ? handle_bug+0x46/0x90
[   37.639094]  ? exc_invalid_op+0x18/0x80
[   37.639577]  ? asm_exc_invalid_op+0x1b/0x20
[   37.640090]  ? do_exit+0x8a2/0xab0
[   37.640537]  ? do_exit+0x6f/0xab0
[   37.640941]  ? syscall_exit_to_user_mode+0x81/0x270
[   37.641542]  make_task_dead+0x81/0x160
[   37.641999]  rewind_stack_and_make_dead+0x16/0x20
[   37.642586] RIP: 0033:0x7bcc58f45b95
[   37.643046] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87
d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f
05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24
[   37.645251] RSP: 002b:00007bcc47dfafd8 EFLAGS: 00000246 ORIG_RAX:
00000000000001aa
[   37.646213] RAX: ffffffffffffffda RBX: 00007bcc400039d0 RCX:
00007bcc58f45b95
[   37.647117] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
0000000000000033
[   37.648018] RBP: 00007bcc400039d8 R08: 0000000000000000 R09:
0000000000000008
[   37.648916] R10: 0000000000000000 R11: 0000000000000246 R12:
00007bcc40003ac0
[   37.649824] R13: 0000000000000000 R14: 0000591ad05f5c68 R15:
0000591b10198450
[   37.650760]  </TASK>
[   37.651034] ---[ end trace 0000000000000000 ]---
Chengming Zhou May 28, 2024, 9:09 a.m. UTC | #8
On 2024/5/28 16:42, Friedrich Weber wrote:
> Hope I did this correctly. With this, the reproducer triggered a BUG
> pretty quickly, see [0]. If I can provide anything else, just let me know.

Thanks for your patience, it's correct. Then how about this fix?

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d98654869615..b2ec5c4c738f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -485,6 +485,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
        if (data->nr_tags > 1) {
                rq = __blk_mq_alloc_requests_batch(data);
                if (rq) {
+                       INIT_LIST_HEAD(&rq->queuelist);
                        blk_mq_rq_time_init(rq, alloc_time_ns);
                        return rq;
                }

> 
> I forgot to mention earlier that I'm running the reproducer against a
> QEMU/KVM guest too, for convenience -- so, the "host" from my previous
> post is in fact running virtualized. I don't think it should be relevant
> here, but I'll try to run the reproducer against real hardware too.

Yeah, I think it should not be relevant.

Thanks.

> 
> Best,
> 
> Friedrich
> 
> [0]
> 
> [   37.543834] kernel BUG at block/blk-mq.c:2991!
> [   37.544458] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> [   37.545077] CPU: 4 PID: 795 Comm: kvm Tainted: G            E
> 6.9.0-troubleshoot1-dirty #28
> [   37.546087] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> [   37.547389] RIP: 0010:blk_mq_submit_bio+0x6f5/0x750
> [   37.547949] Code: ff e9 aa fc ff ff e8 7a 3c a4 ff 48 89 de 4c 89 e7
> e8 9f ef ff ff e8 da 79 a4 ff e9 99 fc ff ff e8 d0 45 a7 ff e9 cd fa ff
> ff <0f> 0b 0f 0b 48 8b 03 48 8b 40 68 48 8b 40 40 e9 8c fb ff ff f6 43
> [   37.550088] RSP: 0018:ffffa2b480863a70 EFLAGS: 00010246
> [   37.550685] RAX: 0000000000000000 RBX: ffff971c0b29d000 RCX:
> 0000000000000000
> [   37.551502] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
> 0000000000000000
> [   37.552325] RBP: ffffa2b480863ae8 R08: 0000000000000000 R09:
> 0000000000000000
> [   37.553130] R10: 0000000000000000 R11: 0000000000000000 R12:
> ffff971c106ee600
> [   37.553934] R13: ffff971c0fb328f0 R14: ffff971c03b330c0 R15:
> 0000000000000001
> [   37.554747] FS:  00007bcc47e006c0(0000) GS:ffff971d37c00000(0000)
> knlGS:0000000000000000
> [   37.555663] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   37.556311] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4:
> 0000000000372ef0
> [   37.557117] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [   37.557935] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> 0000000000000400
> [   37.558789] Call Trace:
> [   37.559080]  <TASK>
> [   37.559355]  ? show_regs+0x6c/0x80
> [   37.560276]  ? die+0x37/0xa0
> [   37.560633]  ? do_trap+0xda/0xf0
> [   37.561023]  ? do_error_trap+0x71/0xb0
> [   37.561486]  ? blk_mq_submit_bio+0x6f5/0x750
> [   37.562008]  ? exc_invalid_op+0x52/0x80
> [   37.562463]  ? blk_mq_submit_bio+0x6f5/0x750
> [   37.562987]  ? asm_exc_invalid_op+0x1b/0x20
> [   37.563492]  ? blk_mq_submit_bio+0x6f5/0x750
> [   37.564007]  __submit_bio+0xb3/0x1c0
> [   37.564439]  submit_bio_noacct_nocheck+0x2fc/0x3d0
> [   37.565019]  submit_bio_noacct+0x1ba/0x6b0
> [   37.565508]  flush_bio_list+0x71/0x110 [raid1]
> [   37.566049]  raid1_unplug+0x3c/0xf0 [raid1]
> [   37.566539]  __blk_flush_plug+0xbe/0x140
> [   37.567016]  blk_finish_plug+0x30/0x50
> [   37.567466]  io_submit_sqes+0x4c4/0x6c0
> [   37.567942]  ? syscall_exit_to_user_mode+0x81/0x270
> [   37.568530]  __do_sys_io_uring_enter+0x2f2/0x640
> [   37.569070]  __x64_sys_io_uring_enter+0x22/0x40
> [   37.569626]  x64_sys_call+0x20b9/0x24b0
> [   37.570071]  do_syscall_64+0x80/0x170
> [   37.570525]  ? vfs_read+0x240/0x380
> [   37.570937]  ? ksys_read+0xe6/0x100
> [   37.571360]  ? syscall_exit_to_user_mode+0x81/0x270
> [   37.571930]  ? do_syscall_64+0x8c/0x170
> [   37.572402]  ? syscall_exit_to_user_mode+0x81/0x270
> [   37.572969]  ? do_syscall_64+0x8c/0x170
> [   37.573438]  ? irqentry_exit+0x43/0x50
> [   37.573878]  ? clear_bhb_loop+0x15/0x70
> [   37.574348]  ? clear_bhb_loop+0x15/0x70
> [   37.574806]  ? clear_bhb_loop+0x15/0x70
> [   37.575282]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   37.575891] RIP: 0033:0x7bcc58f45b95
> [   37.576335] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87
> d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f
> 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24
> [   37.578468] RSP: 002b:00007bcc47dfafd8 EFLAGS: 00000246 ORIG_RAX:
> 00000000000001aa
> [   37.579359] RAX: ffffffffffffffda RBX: 00007bcc400039d0 RCX:
> 00007bcc58f45b95
> [   37.580198] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
> 0000000000000033
> [   37.581028] RBP: 00007bcc400039d8 R08: 0000000000000000 R09:
> 0000000000000008
> [   37.581864] R10: 0000000000000000 R11: 0000000000000246 R12:
> 00007bcc40003ac0
> [   37.582695] R13: 0000000000000000 R14: 0000591ad05f5c68 R15:
> 0000591b10198450
> [   37.583534]  </TASK>
> [   37.583812] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E)
> cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E)
> cifs_md4(E) netfs(E) ebtable_filter(E) ebtables(E) ip_set(E)
> ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E)
> iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) binfmt_misc(E)
> bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E)
> intel_rapl_common(E) intel_uncore_frequency_common(E) intel_pmc_core(E)
> intel_vsec(E) pmt_telemetry(E) pmt_class(E) kvm_intel(E) kvm(E)
> crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E)
> ghash_clmulni_intel(E) sha256_ssse3(E) sha1_ssse3(E) hid_generic(E)
> aesni_intel(E) crypto_simd(E) cryptd(E) usbhid(E) hid(E) rapl(E)
> pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) serio_raw(E)
> vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E)
> qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) raid10(E) raid456(E)
> async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E)
> async_tx(E) xor(E) raid6_pq(E)
> [   37.583875]  libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E)
> psmouse(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) uhci_hcd(E)
> ttm(E) ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E)
> [   37.595967] ---[ end trace 0000000000000000 ]---
> [   37.596555] RIP: 0010:blk_mq_submit_bio+0x6f5/0x750
> [   37.597144] Code: ff e9 aa fc ff ff e8 7a 3c a4 ff 48 89 de 4c 89 e7
> e8 9f ef ff ff e8 da 79 a4 ff e9 99 fc ff ff e8 d0 45 a7 ff e9 cd fa ff
> ff <0f> 0b 0f 0b 48 8b 03 48 8b 40 68 48 8b 40 40 e9 8c fb ff ff f6 43
> [   37.599320] RSP: 0018:ffffa2b480863a70 EFLAGS: 00010246
> [   37.599932] RAX: 0000000000000000 RBX: ffff971c0b29d000 RCX:
> 0000000000000000
> [   37.600757] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
> 0000000000000000
> [   37.601600] RBP: ffffa2b480863ae8 R08: 0000000000000000 R09:
> 0000000000000000
> [   37.602450] R10: 0000000000000000 R11: 0000000000000000 R12:
> ffff971c106ee600
> [   37.603299] R13: ffff971c0fb328f0 R14: ffff971c03b330c0 R15:
> 0000000000000001
> [   37.604129] FS:  00007bcc47e006c0(0000) GS:ffff971d37c00000(0000)
> knlGS:0000000000000000
> [   37.605079] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   37.605774] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4:
> 0000000000372ef0
> [   37.606620] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [   37.607464] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> 0000000000000400
> [   37.608335] ------------[ cut here ]------------
> [   37.608885] WARNING: CPU: 4 PID: 795 at kernel/exit.c:827
> do_exit+0x8a2/0xab0
> [   37.609754] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E)
> cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E)
> cifs_md4(E) netfs(E) ebtable_filter(E) ebtables(E) ip_set(E)
> ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E)
> iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) binfmt_misc(E)
> bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E)
> intel_rapl_common(E) intel_uncore_frequency_common(E) intel_pmc_core(E)
> intel_vsec(E) pmt_telemetry(E) pmt_class(E) kvm_intel(E) kvm(E)
> crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E)
> ghash_clmulni_intel(E) sha256_ssse3(E) sha1_ssse3(E) hid_generic(E)
> aesni_intel(E) crypto_simd(E) cryptd(E) usbhid(E) hid(E) rapl(E)
> pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) serio_raw(E)
> vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E)
> qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) raid10(E) raid456(E)
> async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E)
> async_tx(E) xor(E) raid6_pq(E)
> [   37.609827]  libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E)
> psmouse(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) uhci_hcd(E)
> ttm(E) ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E)
> [   37.622013] CPU: 4 PID: 795 Comm: kvm Tainted: G      D     E
> 6.9.0-troubleshoot1-dirty #28
> [   37.623045] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> [   37.624382] RIP: 0010:do_exit+0x8a2/0xab0
> [   37.624869] Code: 38 0c 00 00 e9 2d fe ff ff 49 8d 7d 18 e8 b6 83 07
> 00 e9 5a f8 ff ff 48 8b bb f8 09 00 00 31 f6 e8 b3 e0 ff ff e9 c1 fd ff
> ff <0f> 0b e9 d4 f7 ff ff 0f 0b e9 8f f7 ff ff 4c 89 e6 bf 05 06 00 00
> [   37.627117] RSP: 0018:ffffa2b480863ec8 EFLAGS: 00010286
> [   37.627780] RAX: 0000000000000000 RBX: ffff971c05458000 RCX:
> 0000000000000000
> [   37.628638] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
> 0000000000000000
> [   37.629512] RBP: ffffa2b480863f20 R08: 0000000000000000 R09:
> 0000000000000000
> [   37.630379] R10: 0000000000000000 R11: 0000000000000000 R12:
> 000000000000000b
> [   37.631231] R13: ffff971c04c8d100 R14: ffff971c00c28000 R15:
> ffffa2b4808639c8
> [   37.632095] FS:  00007bcc47e006c0(0000) GS:ffff971d37c00000(0000)
> knlGS:0000000000000000
> [   37.633090] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   37.633794] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4:
> 0000000000372ef0
> [   37.634666] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [   37.635533] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> 0000000000000400
> [   37.636387] Call Trace:
> [   37.636692]  <TASK>
> [   37.636963]  ? show_regs+0x6c/0x80
> [   37.637393]  ? __warn+0x88/0x140
> [   37.637801]  ? do_exit+0x8a2/0xab0
> [   37.638197]  ? report_bug+0x182/0x1b0
> [   37.638674]  ? handle_bug+0x46/0x90
> [   37.639094]  ? exc_invalid_op+0x18/0x80
> [   37.639577]  ? asm_exc_invalid_op+0x1b/0x20
> [   37.640090]  ? do_exit+0x8a2/0xab0
> [   37.640537]  ? do_exit+0x6f/0xab0
> [   37.640941]  ? syscall_exit_to_user_mode+0x81/0x270
> [   37.641542]  make_task_dead+0x81/0x160
> [   37.641999]  rewind_stack_and_make_dead+0x16/0x20
> [   37.642586] RIP: 0033:0x7bcc58f45b95
> [   37.643046] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87
> d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f
> 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24
> [   37.645251] RSP: 002b:00007bcc47dfafd8 EFLAGS: 00000246 ORIG_RAX:
> 00000000000001aa
> [   37.646213] RAX: ffffffffffffffda RBX: 00007bcc400039d0 RCX:
> 00007bcc58f45b95
> [   37.647117] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
> 0000000000000033
> [   37.648018] RBP: 00007bcc400039d8 R08: 0000000000000000 R09:
> 0000000000000008
> [   37.648916] R10: 0000000000000000 R11: 0000000000000246 R12:
> 00007bcc40003ac0
> [   37.649824] R13: 0000000000000000 R14: 0000591ad05f5c68 R15:
> 0000591b10198450
> [   37.650760]  </TASK>
> [   37.651034] ---[ end trace 0000000000000000 ]---
>
Friedrich Weber May 28, 2024, 2:40 p.m. UTC | #9
On 28/05/2024 11:09, Chengming Zhou wrote:
> On 2024/5/28 16:42, Friedrich Weber wrote:
>> Hope I did this correctly. With this, the reproducer triggered a BUG
>> pretty quickly, see [0]. If I can provide anything else, just let me know.
> 
> Thanks for your patience, it's correct. Then how about this fix?
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d98654869615..b2ec5c4c738f 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -485,6 +485,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
>         if (data->nr_tags > 1) {
>                 rq = __blk_mq_alloc_requests_batch(data);
>                 if (rq) {
> +                       INIT_LIST_HEAD(&rq->queuelist);
>                         blk_mq_rq_time_init(rq, alloc_time_ns);
>                         return rq;
>                 }
> 

Nice, seems like with this patch applied on top of 6.9, the reproducer
does not trigger crashes anymore for me! Thanks!

To verify that the reproducer hits the new INIT_LIST_HEAD, I added debug
prints before/after:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4da581f13273..75186bb0d9c9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -485,7 +485,9 @@ static struct request
*__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
        if (data->nr_tags > 1) {
                rq = __blk_mq_alloc_requests_batch(data);
                if (rq) {
+                       pr_warn("before init: list: %p next: %p prev:
%p\n", &rq->queuelist, rq->queuelist.next, rq->queuelist.prev);
                        INIT_LIST_HEAD(&rq->queuelist);
+                       pr_warn("after init: list: %p next: %p prev:
%p\n", &rq->queuelist, rq->queuelist.next, rq->queuelist.prev);
                        blk_mq_rq_time_init(rq, alloc_time_ns);
                        return rq;
                }

And indeed, I see quite some printouts where rq->queuelist.next differs
before/after the request, e.g.

May 28 16:31:25 reproflushfull kernel: before init: list:
000000001e0a144f next: 00000000aaa2e372 prev: 000000001e0a144f
May 28 16:31:25 reproflushfull kernel: after init: list:
000000001e0a144f next: 000000001e0a144f prev: 000000001e0a144f
May 28 16:31:26 reproflushfull kernel: before init: list:
000000001e0a144f next: 00000000aaa2e372 prev: 000000001e0a144f
May 28 16:31:26 reproflushfull kernel: after init: list:
000000001e0a144f next: 000000001e0a144f prev: 000000001e0a144f

I know very little about the block layer, but if I understand the commit
message of the original 81ada09cc25e correctly, it's expected that
queuelist needs to be reinitialized at some places. I'm just a little
confused to see the same pointer 00000000aaa2e372 in two subsequent
"before init" printouts for the same queuelist 000000001e0a144f. Is this
expected too?

Also, just out of interest: Can you estimate whether this issue is
specific to software RAID setups, or could similar NULL pointer
dereferences also happen in setups without software RAID?

Best wishes,

Friedrich
Chengming Zhou May 29, 2024, 8:50 a.m. UTC | #10
On 2024/5/28 22:40, Friedrich Weber wrote:
> On 28/05/2024 11:09, Chengming Zhou wrote:
>> On 2024/5/28 16:42, Friedrich Weber wrote:
>>> Hope I did this correctly. With this, the reproducer triggered a BUG
>>> pretty quickly, see [0]. If I can provide anything else, just let me know.
>>
>> Thanks for your patience, it's correct. Then how about this fix?
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index d98654869615..b2ec5c4c738f 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -485,6 +485,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
>>         if (data->nr_tags > 1) {
>>                 rq = __blk_mq_alloc_requests_batch(data);
>>                 if (rq) {
>> +                       INIT_LIST_HEAD(&rq->queuelist);
>>                         blk_mq_rq_time_init(rq, alloc_time_ns);
>>                         return rq;
>>                 }
>>
> 
> Nice, seems like with this patch applied on top of 6.9, the reproducer
> does not trigger crashes anymore for me! Thanks!

Good news. Thanks.

> 
> To verify that the reproducer hits the new INIT_LIST_HEAD, I added debug
> prints before/after:
[...]
> And indeed, I see quite some printouts where rq->queuelist.next differs
> before/after the request, e.g.
> 
> May 28 16:31:25 reproflushfull kernel: before init: list:
> 000000001e0a144f next: 00000000aaa2e372 prev: 000000001e0a144f
> May 28 16:31:25 reproflushfull kernel: after init: list:
> 000000001e0a144f next: 000000001e0a144f prev: 000000001e0a144f
> May 28 16:31:26 reproflushfull kernel: before init: list:
> 000000001e0a144f next: 00000000aaa2e372 prev: 000000001e0a144f
> May 28 16:31:26 reproflushfull kernel: after init: list:
> 000000001e0a144f next: 000000001e0a144f prev: 000000001e0a144f
> 
> I know very little about the block layer, but if I understand the commit
> message of the original 81ada09cc25e correctly, it's expected that
> queuelist needs to be reinitialized at some places. I'm just a little

Yes, because we use list_move_tail() in the flush sequences. Maybe we can
just use list_add_tail() so we don't need the queuelist initialized. It
should be ok since rq can't be on any list when PREFLUSH or POSTFLUSH,
so there isn't any move actually.

But now I'm concerned that rq->queuelist maybe changed by driver after
request end? Don't know if this is a reasonable behavior, or I'm afraid
that the safest way is to revert the last patch. Want to know what Jens,
Ming and Christoph think?

> confused to see the same pointer 00000000aaa2e372 in two subsequent
> "before init" printouts for the same queuelist 000000001e0a144f. Is this
> expected too?
Not sure, but it seems possible. This is a rq_list in the plug cache,
000000001e0a144f is a PREFLUSH request, it will be freed after request end.
Then next time we again allocate it and put it in the plug cache,
just before 00000000aaa2e372 again. The reason why block doesn't use
00000000aaa2e372 maybe it's from a different queue or hardware queue.
But these are just my guess.

> 
> Also, just out of interest: Can you estimate whether this issue is
> specific to software RAID setups, or could similar NULL pointer
> dereferences also happen in setups without software RAID?

I think it can also happen without software RAID.

Thanks.
Christoph Hellwig May 31, 2024, 6:17 a.m. UTC | #11
On Wed, May 29, 2024 at 04:50:02PM +0800, Chengming Zhou wrote:
> Yes, because we use list_move_tail() in the flush sequences. Maybe we can
> just use list_add_tail() so we don't need the queuelist initialized. It
> should be ok since rq can't be on any list when PREFLUSH or POSTFLUSH,
> so there isn't any move actually.

Sounds good.

> But now I'm concerned that rq->queuelist maybe changed by driver after
> request end?

How could the driver change it?

> > Also, just out of interest: Can you estimate whether this issue is
> > specific to software RAID setups, or could similar NULL pointer
> > dereferences also happen in setups without software RAID?
> 
> I think it can also happen without software RAID.

Seems to be about batch allocation.  So you either need a plug in
the stacking device, or io_uring.  I guess people aren't using the
io_uring high performance options on devices with a write cache
all that much, as that should immediately reproduce the problem.
Chengming Zhou May 31, 2024, 8:16 a.m. UTC | #12
On 2024/5/31 14:17, Christoph Hellwig wrote:
> On Wed, May 29, 2024 at 04:50:02PM +0800, Chengming Zhou wrote:
>> Yes, because we use list_move_tail() in the flush sequences. Maybe we can
>> just use list_add_tail() so we don't need the queuelist initialized. It
>> should be ok since rq can't be on any list when PREFLUSH or POSTFLUSH,
>> so there isn't any move actually.
> 
> Sounds good.

Ok, I could send a fix that changes to use list_add_tail() later.

> 
>> But now I'm concerned that rq->queuelist maybe changed by driver after
>> request end?
> 
> How could the driver change it?

I don't know much about drivers. Normally, they will detach rq->queuelist
from their internal list and do blk_mq_end_request(), in which we reuse
this queuelist to add rq to the post-flush list.

Strictly speaking, that rq's ownership still belongs to the drivers until
they call blk_mq_free_request(), right? So I'm not sure if the drivers
would touch rq->queuelist after blk_mq_end_request(). If the drivers don't
have such behaviors, then we are good.

> 
>>> Also, just out of interest: Can you estimate whether this issue is
>>> specific to software RAID setups, or could similar NULL pointer
>>> dereferences also happen in setups without software RAID?
>>
>> I think it can also happen without software RAID.
> 
> Seems to be about batch allocation.  So you either need a plug in
> the stacking device, or io_uring.  I guess people aren't using the
> io_uring high performance options on devices with a write cache
> all that much, as that should immediately reproduce the problem.
>
diff mbox series

Patch

diff --git a/block/blk-flush.c b/block/blk-flush.c
index fedb39031647..e73dc22d05c1 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -183,14 +183,13 @@  static void blk_flush_complete_seq(struct request *rq,
 		/* queue for flush */
 		if (list_empty(pending))
 			fq->flush_pending_since = jiffies;
-		list_move_tail(&rq->flush.list, pending);
+		list_move_tail(&rq->queuelist, pending);
 		break;
 
 	case REQ_FSEQ_DATA:
-		list_del_init(&rq->flush.list);
 		fq->flush_data_in_flight++;
 		spin_lock(&q->requeue_lock);
-		list_add(&rq->queuelist, &q->requeue_list);
+		list_move(&rq->queuelist, &q->requeue_list);
 		spin_unlock(&q->requeue_lock);
 		blk_mq_kick_requeue_list(q);
 		break;
@@ -202,7 +201,7 @@  static void blk_flush_complete_seq(struct request *rq,
 		 * flush data request completion path.  Restore @rq for
 		 * normal completion and end it.
 		 */
-		list_del_init(&rq->flush.list);
+		list_del_init(&rq->queuelist);
 		blk_flush_restore_request(rq);
 		blk_mq_end_request(rq, error);
 		break;
@@ -258,7 +257,7 @@  static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
 	fq->flush_running_idx ^= 1;
 
 	/* and push the waiting requests to the next stage */
-	list_for_each_entry_safe(rq, n, running, flush.list) {
+	list_for_each_entry_safe(rq, n, running, queuelist) {
 		unsigned int seq = blk_flush_cur_seq(rq);
 
 		BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
@@ -292,7 +291,7 @@  static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 {
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 	struct request *first_rq =
-		list_first_entry(pending, struct request, flush.list);
+		list_first_entry(pending, struct request, queuelist);
 	struct request *flush_rq = fq->flush_rq;
 
 	/* C1 described at the top of this file */
@@ -376,6 +375,11 @@  static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
 	 */
 	spin_lock_irqsave(&fq->mq_flush_lock, flags);
 	fq->flush_data_in_flight--;
+	/*
+	 * May have been corrupted by rq->rq_next reuse, we need to
+	 * re-initialize rq->queuelist before reusing it here.
+	 */
+	INIT_LIST_HEAD(&rq->queuelist);
 	blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
 	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 
@@ -386,7 +390,6 @@  static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
 static void blk_rq_init_flush(struct request *rq)
 {
 	rq->flush.seq = 0;
-	INIT_LIST_HEAD(&rq->flush.list);
 	rq->rq_flags |= RQF_FLUSH_SEQ;
 	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
 	rq->end_io = mq_flush_data_end_io;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 67f810857634..01e8c31db665 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -178,7 +178,6 @@  struct request {
 
 	struct {
 		unsigned int		seq;
-		struct list_head	list;
 		rq_end_io_fn		*saved_end_io;
 	} flush;