diff mbox series

BUG: KASAN: stack-out-of-bounds in unwind_next_frame+0x1df5/0x2650

Message ID CABWYdi3HjduhY-nQXzy2ezGbiMB1Vk9cnhW2pMypUa+P1OjtzQ@mail.gmail.com (mailing list archive)
State New, archived
Headers show
Series BUG: KASAN: stack-out-of-bounds in unwind_next_frame+0x1df5/0x2650 | expand

Commit Message

Ivan Babrou Jan. 29, 2021, 3:35 a.m. UTC
Hello,

We've noticed the following regression in Linux 5.10 branch:

[  128.367231][    C0]
==================================================================
[  128.368523][    C0] BUG: KASAN: stack-out-of-bounds in
unwind_next_frame (arch/x86/kernel/unwind_orc.c:371
arch/x86/kernel/unwind_orc.c:544)
[  128.369744][    C0] Read of size 8 at addr ffff88802fceede0 by task
kworker/u2:2/591
[  128.370916][    C0]
[  128.371269][    C0] CPU: 0 PID: 591 Comm: kworker/u2:2 Not tainted
5.10.11-cloudflare-kasan-2021.1.15 #1
[  128.372626][    C0] Hardware name: QEMU Standard PC (i440FX + PIIX,
1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
[  128.374346][    C0] Workqueue: writeback wb_workfn (flush-254:0)
[  128.375275][    C0] Call Trace:
[  128.375763][    C0]  <IRQ>
[  128.376221][    C0]  dump_stack+0x7d/0xa3
[  128.376843][    C0]  print_address_description.constprop.0+0x1c/0x210
[  128.377827][    C0]  ? _raw_spin_lock_irqsave
(arch/x86/include/asm/atomic.h:202
include/asm-generic/atomic-instrumented.h:707
include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:195
include/linux/spinlock_api_smp.h:119 kernel/locking/spinlock.c:159)
[  128.378624][    C0]  ? _raw_write_unlock_bh (kernel/locking/spinlock.c:158)
[  128.379389][    C0]  ? unwind_next_frame (arch/x86/kernel/unwind_orc.c:444)
[  128.380177][    C0]  ? unwind_next_frame
(arch/x86/kernel/unwind_orc.c:371 arch/x86/kernel/unwind_orc.c:544)
[  128.380954][    C0]  ? unwind_next_frame
(arch/x86/kernel/unwind_orc.c:371 arch/x86/kernel/unwind_orc.c:544)
[  128.381736][    C0]  kasan_report.cold+0x1f/0x37
[  128.382438][    C0]  ? unwind_next_frame
(arch/x86/kernel/unwind_orc.c:371 arch/x86/kernel/unwind_orc.c:544)
[  128.383192][    C0]  unwind_next_frame+0x1df5/0x2650
[  128.383954][    C0]  ? asm_common_interrupt
(arch/x86/include/asm/idtentry.h:622)
[  128.384726][    C0]  ? get_stack_info_noinstr
(arch/x86/kernel/dumpstack_64.c:157)
[  128.385530][    C0]  ? glue_xts_req_128bit+0x110/0x6f0 glue_helper
[  128.386509][    C0]  ? deref_stack_reg (arch/x86/kernel/unwind_orc.c:418)
[  128.387267][    C0]  ? is_module_text_address (kernel/module.c:4566
kernel/module.c:4550)
[  128.388077][    C0]  ? glue_xts_req_128bit+0x110/0x6f0 glue_helper
[  128.389048][    C0]  ? kernel_text_address.part.0 (kernel/extable.c:145)
[  128.389901][    C0]  ? glue_xts_req_128bit+0x110/0x6f0 glue_helper
[  128.390865][    C0]  ? stack_trace_save (kernel/stacktrace.c:82)
[  128.391550][    C0]  arch_stack_walk+0x8d/0xf0
[  128.392216][    C0]  ? kfree (mm/slub.c:3142 mm/slub.c:4124)
[  128.392807][    C0]  stack_trace_save+0x96/0xd0
[  128.393535][    C0]  ? create_prof_cpu_mask (kernel/stacktrace.c:113)
[  128.394320][    C0]  ? blk_update_request (block/blk-core.c:264
block/blk-core.c:1468)
[  128.395113][    C0]  ? asm_call_irq_on_stack (arch/x86/entry/entry_64.S:796)
[  128.395887][    C0]  ? do_softirq_own_stack
(arch/x86/include/asm/irq_stack.h:27
arch/x86/include/asm/irq_stack.h:77 arch/x86/kernel/irq_64.c:77)
[  128.396678][    C0]  ? irq_exit_rcu (kernel/softirq.c:393
kernel/softirq.c:423 kernel/softirq.c:435)
[  128.397349][    C0]  ? common_interrupt (arch/x86/kernel/irq.c:239)
[  128.398086][    C0]  ? asm_common_interrupt
(arch/x86/include/asm/idtentry.h:622)
[  128.398886][    C0]  ? get_page_from_freelist (mm/page_alloc.c:3480
mm/page_alloc.c:3904)
[  128.399759][    C0]  kasan_save_stack+0x20/0x50
[  128.400453][    C0]  ? kasan_save_stack (mm/kasan/common.c:48)
[  128.401175][    C0]  ? kasan_set_track (mm/kasan/common.c:56)
[  128.401881][    C0]  ? kasan_set_free_info (mm/kasan/generic.c:360)
[  128.402646][    C0]  ? __kasan_slab_free (mm/kasan/common.c:283
mm/kasan/common.c:424)
[  128.403375][    C0]  ? slab_free_freelist_hook (mm/slub.c:1577)
[  128.404199][    C0]  ? kfree (mm/slub.c:3142 mm/slub.c:4124)
[  128.404835][    C0]  ? nvme_pci_complete_rq+0x105/0x350 nvme
[  128.405765][    C0]  ? blk_done_softirq (include/linux/list.h:282
block/blk-mq.c:581)
[  128.406552][    C0]  ? __do_softirq
(arch/x86/include/asm/jump_label.h:25 include/linux/jump_label.h:200
include/trace/events/irq.h:142 kernel/softirq.c:299)
[  128.407272][    C0]  ? asm_call_irq_on_stack (arch/x86/entry/entry_64.S:796)
[  128.408087][    C0]  ? do_softirq_own_stack
(arch/x86/include/asm/irq_stack.h:27
arch/x86/include/asm/irq_stack.h:77 arch/x86/kernel/irq_64.c:77)
[  128.408878][    C0]  ? irq_exit_rcu (kernel/softirq.c:393
kernel/softirq.c:423 kernel/softirq.c:435)
[  128.409602][    C0]  ? common_interrupt (arch/x86/kernel/irq.c:239)
[  128.410366][    C0]  ? asm_common_interrupt
(arch/x86/include/asm/idtentry.h:622)
[  128.411184][    C0]  ? skcipher_walk_next (crypto/skcipher.c:322
crypto/skcipher.c:384)
[  128.412009][    C0]  ? skcipher_walk_virt (crypto/skcipher.c:487)
[  128.412811][    C0]  ? glue_xts_req_128bit+0x110/0x6f0 glue_helper
[  128.413792][    C0]  ? asm_common_interrupt
(arch/x86/include/asm/idtentry.h:622)
[  128.414562][    C0]  ? kcryptd_crypt_write_convert+0x3a2/0xa10 dm_crypt
[  128.415591][    C0]  ? crypt_map+0x5c1/0xc70 dm_crypt
[  128.416389][    C0]  ? __map_bio.isra.0+0x109/0x450 dm_mod
[  128.417275][    C0]  ? __split_and_process_non_flush+0x728/0xd10 dm_mod
[  128.418293][    C0]  ? dm_submit_bio+0x4f1/0xec0 dm_mod
[  128.419068][    C0]  ? submit_bio_noacct (block/blk-core.c:934
block/blk-core.c:982 block/blk-core.c:1061)
[  128.419806][    C0]  ? submit_bio (block/blk-core.c:1079)
[  128.420458][    C0]  ? _raw_spin_lock_irqsave
(arch/x86/include/asm/atomic.h:202
include/asm-generic/atomic-instrumented.h:707
include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:195
include/linux/spinlock_api_smp.h:119 kernel/locking/spinlock.c:159)
[  128.421244][    C0]  ? _raw_write_unlock_bh (kernel/locking/spinlock.c:158)
[  128.422015][    C0]  ? ret_from_fork (arch/x86/entry/entry_64.S:302)
[  128.422696][    C0]  ? kmem_cache_free (mm/slub.c:3142 mm/slub.c:3158)
[  128.423427][    C0]  ? memset (mm/kasan/common.c:84)
[  128.424000][    C0]  ? dma_pool_free (mm/dmapool.c:405)
[  128.424698][    C0]  ? slab_free_freelist_hook (mm/slub.c:1577)
[  128.425518][    C0]  ? dma_pool_create (mm/dmapool.c:405)
[  128.426234][    C0]  ? kmem_cache_free (mm/slub.c:3142 mm/slub.c:3158)
[  128.426923][    C0]  ? raise_softirq_irqoff
(arch/x86/include/asm/preempt.h:26 kernel/softirq.c:469)
[  128.427691][    C0]  kasan_set_track+0x1c/0x30
[  128.428366][    C0]  kasan_set_free_info+0x1b/0x30
[  128.429113][    C0]  __kasan_slab_free+0x110/0x150
[  128.429838][    C0]  slab_free_freelist_hook+0x66/0x120
[  128.430628][    C0]  kfree+0xbf/0x4d0
[  128.431192][    C0]  ? nvme_pci_complete_rq+0x105/0x350 nvme
[  128.432107][    C0]  ? nvme_unmap_data+0x349/0x440 nvme
[  128.432882][    C0]  nvme_pci_complete_rq+0x105/0x350 nvme
[  128.433750][    C0]  blk_done_softirq+0x2ff/0x590
[  128.434441][    C0]  ? blk_mq_stop_hw_queue (block/blk-mq.c:573)
[  128.435161][    C0]  ? _raw_spin_lock_bh (kernel/locking/spinlock.c:150)
[  128.435894][    C0]  ? _raw_spin_lock_bh (kernel/locking/spinlock.c:150)
[  128.436582][    C0]  __do_softirq+0x1a0/0x667
[  128.437218][    C0]  asm_call_irq_on_stack+0x12/0x20
[  128.437975][    C0]  </IRQ>
[  128.438397][    C0]  do_softirq_own_stack+0x37/0x40
[  128.439120][    C0]  irq_exit_rcu+0x110/0x1b0
[  128.439807][    C0]  common_interrupt+0x74/0x120
[  128.440545][    C0]  asm_common_interrupt+0x1e/0x40
[  128.441287][    C0] RIP: 0010:skcipher_walk_next
(crypto/skcipher.c:322 crypto/skcipher.c:384)
[  128.442126][    C0] Code: 85 dd 10 00 00 49 8d 7c 24 08 49 89 14 24
48 b9 00 00 00 00 00 fc ff df 41 81 e5 ff 0f 00 00 48 89 fe 48 c1 ee
03 80 3c 0e 00 <0f> 85 80 10 00 00 48 89 c6 4d 89 6c 24 08 48 bc
All code
========
   0: 85 dd                test   %ebx,%ebp
   2: 10 00                adc    %al,(%rax)
   4: 00 49 8d              add    %cl,-0x73(%rcx)
   7: 7c 24                jl     0x2d
   9: 08 49 89              or     %cl,-0x77(%rcx)
   c: 14 24                adc    $0x24,%al
   e: 48 b9 00 00 00 00 00 movabs $0xdffffc0000000000,%rcx
  15: fc ff df
  18: 41 81 e5 ff 0f 00 00 and    $0xfff,%r13d
  1f: 48 89 fe              mov    %rdi,%rsi
  22: 48 c1 ee 03          shr    $0x3,%rsi
  26: 80 3c 0e 00          cmpb   $0x0,(%rsi,%rcx,1)
  2a:* 0f 85 80 10 00 00    jne    0x10b0 <-- trapping instruction
  30: 48 89 c6              mov    %rax,%rsi
  33: 4d 89 6c 24 08        mov    %r13,0x8(%r12)
  38: 48                    rex.W
  39: bc                    .byte 0xbc

Code starting with the faulting instruction
===========================================
   0: 0f 85 80 10 00 00    jne    0x1086
   6: 48 89 c6              mov    %rax,%rsi
   9: 4d 89 6c 24 08        mov    %r13,0x8(%r12)
   e: 48                    rex.W
   f: bc                    .byte 0xbc
[  128.445089][    C0] RSP: 0018:ffff88802fceebf0 EFLAGS: 00000246
[  128.445969][    C0] RAX: ffff888003b571b8 RBX: 0000000000000000
RCX: dffffc0000000000
[  128.447124][    C0] RDX: ffffea00017cd580 RSI: 1ffff11005f9dda8
RDI: ffff88802fceed40
[  128.448281][    C0] RBP: ffff88802fceec70 R08: ffff88802fceedc4
R09: 00000000ffffffee
[  128.449457][    C0] R10: 0000000000000000 R11: 1ffff11005f9ddaf
R12: ffff88802fceed38
[  128.450641][    C0] R13: 0000000000000000 R14: ffff888003b57138
R15: ffff88802fceedc8
[  128.451827][    C0]  ? arch_stack_walk (arch/x86/kernel/stacktrace.c:24)
[  128.452482][    C0]  skcipher_walk_virt+0x4be/0x7e0
[  128.453242][    C0]  glue_xts_req_128bit+0x110/0x6f0 glue_helper
[  128.454175][    C0]  ? aesni_set_key+0x1e0/0x1e0 aesni_intel
[  128.455042][    C0]  ? irq_exit_rcu (kernel/softirq.c:406
kernel/softirq.c:425 kernel/softirq.c:435)
[  128.455719][    C0]  ? glue_xts_crypt_128bit_one+0x280/0x280 glue_helper
[  128.456753][    C0]  asm_common_interrupt+0x1e/0x40
[  128.457530][    C0] RIP: b8fa2500:0xdffffc0000000000
[  128.458305][    C0] Code: Unable to access opcode bytes at RIP
0xdffffbffffffffd6.

Code starting with the faulting instruction
===========================================
[  128.459443][    C0] RSP: 974be3f3:ffff88809c437290 EFLAGS: 00000004
ORIG_RAX: 0000001000000010
[  128.460755][    C0] RAX: 0000000000000000 RBX: ffff888003b571b8
RCX: 0000000000000000
[  128.461967][    C0] RDX: ffff888003b57240 RSI: ffff888003b57240
RDI: ffffffe000000010
[  128.463152][    C0] RBP: dffffc0000000200 R08: 0000000000000801
R09: ffffea0001123480
[  128.464345][    C0] R10: ffffed1000000200 R11: ffffffff00000000
R12: ffff888000000000
[  128.465522][    C0] R13: ffff888003b57138 R14: ffff88809c437290
R15: ffffea00002c5b08
[  128.466710][    C0]  ? get_page_from_freelist (mm/page_alloc.c:3913)
[  128.467560][    C0]  ? worker_thread (include/linux/list.h:282
kernel/workqueue.c:2419)
[  128.468279][    C0]  ? kthread (kernel/kthread.c:292)
[  128.468919][    C0]  ? ret_from_fork (arch/x86/entry/entry_64.S:302)
[  128.469607][    C0]  ? __writeback_inodes_wb (fs/fs-writeback.c:1793)
[  128.470418][    C0]  ? wb_writeback (fs/fs-writeback.c:1898)
[  128.471145][    C0]  ? process_one_work
(arch/x86/include/asm/jump_label.h:25 include/linux/jump_label.h:200
include/trace/events/workqueue.h:108 kernel/workqueue.c:2277)
[  128.471930][    C0]  ? worker_thread (include/linux/list.h:282
kernel/workqueue.c:2419)
[  128.472668][    C0]  ? ret_from_fork (arch/x86/entry/entry_64.S:302)
[  128.473329][    C0]  ? __zone_watermark_ok (mm/page_alloc.c:3793)
[  128.474065][    C0]  ? __kasan_kmalloc.constprop.0
(mm/kasan/common.c:56 mm/kasan/common.c:461)
[  128.474914][    C0]  ? crypt_convert+0x27e5/0x4530 dm_crypt
[  128.475796][    C0]  ? mempool_alloc (mm/mempool.c:392)
[  128.476493][    C0]  ? crypt_iv_tcw_ctr+0x4a0/0x4a0 dm_crypt
[  128.477433][    C0]  ? bio_add_page (block/bio.c:943)
[  128.478129][    C0]  ? __bio_try_merge_page (block/bio.c:935)
[  128.478923][    C0]  ? bio_associate_blkg (block/blk-cgroup.c:1869)
[  128.479693][    C0]  ? kcryptd_crypt_write_convert+0x581/0xa10 dm_crypt
[  128.480721][    C0]  ? crypt_map+0x5c1/0xc70 dm_crypt
[  128.481527][    C0]  ? bio_clone_blkg_association (block/blk-cgroup.c:1883)
[  128.482426][    C0]  ? __map_bio.isra.0+0x109/0x450 dm_mod
[  128.483310][    C0]  ? __split_and_process_non_flush+0x728/0xd10 dm_mod
[  128.484354][    C0]  ? __send_empty_flush+0x4b0/0x4b0 dm_mod
[  128.485223][    C0]  ? __part_start_io_acct (block/blk-core.c:1336)
[  128.486009][    C0]  ? dm_submit_bio+0x4f1/0xec0 dm_mod
[  128.486829][    C0]  ? __split_and_process_non_flush+0xd10/0xd10 dm_mod
[  128.487915][    C0]  ? submit_bio_noacct (block/blk-core.c:934
block/blk-core.c:982 block/blk-core.c:1061)
[  128.488686][    C0]  ? _cond_resched (kernel/sched/core.c:6124)
[  128.489388][    C0]  ? blk_queue_enter (block/blk-core.c:1044)
[  128.490300][    C0]  ? iomap_readahead (fs/iomap/buffered-io.c:1438)
[  128.491041][    C0]  ? write_one_page (mm/page-writeback.c:2171)
[  128.491759][    C0]  ? submit_bio (block/blk-core.c:1079)
[  128.492432][    C0]  ? submit_bio_noacct (block/blk-core.c:1079)
[  128.493248][    C0]  ? _raw_spin_lock
(arch/x86/include/asm/atomic.h:202
include/asm-generic/atomic-instrumented.h:707
include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:183
include/linux/spinlock_api_smp.h:143 kernel/locking/spinlock.c:151)
[  128.493975][    C0]  ? iomap_submit_ioend (fs/iomap/buffered-io.c:1215)
[  128.494761][    C0]  ? xfs_vm_writepages (fs/xfs/xfs_aops.c:578)
[  128.495529][    C0]  ? xfs_dax_writepages (fs/xfs/xfs_aops.c:578)
[  128.496278][    C0]  ? __blk_mq_do_dispatch_sched
(block/blk-mq-sched.c:135 (discriminator 1))
[  128.497120][    C0]  ? do_writepages (mm/page-writeback.c:2355)
[  128.497831][    C0]  ? page_writeback_cpu_online (mm/page-writeback.c:2345)
[  128.498681][    C0]  ? _raw_spin_lock
(arch/x86/include/asm/atomic.h:202
include/asm-generic/atomic-instrumented.h:707
include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:183
include/linux/spinlock_api_smp.h:143 kernel/locking/spinlock.c:151)
[  128.499405][    C0]  ? wake_up_bit (kernel/sched/wait_bit.c:15
kernel/sched/wait_bit.c:149)
[  128.500072][    C0]  ? __writeback_single_inode (fs/fs-writeback.c:1470)
[  128.500908][    C0]  ? writeback_sb_inodes (fs/fs-writeback.c:1725)
[  128.501703][    C0]  ? __writeback_single_inode (fs/fs-writeback.c:1634)
[  128.502571][    C0]  ? finish_writeback_work.constprop.0
(fs/fs-writeback.c:1242)
[  128.503525][    C0]  ? __writeback_inodes_wb (fs/fs-writeback.c:1793)
[  128.504336][    C0]  ? wb_writeback (fs/fs-writeback.c:1898)
[  128.505031][    C0]  ? __writeback_inodes_wb (fs/fs-writeback.c:1846)
[  128.505902][    C0]  ? cpumask_next (lib/cpumask.c:24)
[  128.506570][    C0]  ? get_nr_dirty_inodes (fs/inode.c:94 fs/inode.c:102)
[  128.507348][    C0]  ? wb_workfn (fs/fs-writeback.c:2054
fs/fs-writeback.c:2082)
[  128.508014][    C0]  ? dequeue_entity (kernel/sched/fair.c:4347)
[  128.508744][    C0]  ? inode_wait_for_writeback (fs/fs-writeback.c:2065)
[  128.509586][    C0]  ? put_prev_entity (kernel/sched/fair.c:4501)
[  128.510300][    C0]  ? __switch_to
(arch/x86/include/asm/bitops.h:55
include/asm-generic/bitops/instrumented-atomic.h:29
include/linux/thread_info.h:55 arch/x86/include/asm/fpu/internal.h:572
arch/x86/kernel/process_64.c:598)
[  128.510990][    C0]  ? __switch_to_asm (arch/x86/entry/entry_64.S:255)
[  128.511695][    C0]  ? __schedule (kernel/sched/core.c:3782
kernel/sched/core.c:4528)
[  128.512373][    C0]  ? process_one_work
(arch/x86/include/asm/jump_label.h:25 include/linux/jump_label.h:200
include/trace/events/workqueue.h:108 kernel/workqueue.c:2277)
[  128.513133][    C0]  ? worker_thread (include/linux/list.h:282
kernel/workqueue.c:2419)
[  128.513850][    C0]  ? rescuer_thread (kernel/workqueue.c:2361)
[  128.514566][    C0]  ? kthread (kernel/kthread.c:292)
[  128.515200][    C0]  ? __kthread_bind_mask (kernel/kthread.c:245)
[  128.515960][    C0]  ? ret_from_fork (arch/x86/entry/entry_64.S:302)
[  128.516641][    C0]
[  128.516983][    C0] The buggy address belongs to the page:
[  128.517838][    C0] page:000000007a390a2b refcount:0 mapcount:0
mapping:0000000000000000 index:0x0 pfn:0x2fcee
[  128.519428][    C0] flags: 0x1ffff800000000()
[  128.520102][    C0] raw: 001ffff800000000 ffffea0000bf3b88
ffffea0000bf3b88 0000000000000000
[  128.521396][    C0] raw: 0000000000000000 0000000000000000
00000000ffffffff 0000000000000000
[  128.522673][    C0] page dumped because: kasan: bad access detected
[  128.523642][    C0]
[  128.523984][    C0] addr ffff88802fceede0 is located in stack of
task kworker/u2:2/591 at offset 216 in frame:
[  128.525503][    C0]  glue_xts_req_128bit+0x0/0x6f0 glue_helper
[  128.526390][    C0]
[  128.526745][    C0] this frame has 5 objects:
[  128.527405][    C0]  [48, 200) 'walk'
[  128.527407][    C0]  [272, 304) 'b'
[  128.527969][    C0]  [336, 400) 's'
[  128.528509][    C0]  [432, 496) 'd'
[  128.529047][    C0]  [528, 608) 'subreq'
[  128.529607][    C0]
[  128.530568][    C0] Memory state around the buggy address:
[  128.531443][    C0]  ffff88802fceec80: 00 00 00 00 00 00 00 00 00
00 00 00 00 00 00 00
[  128.532708][    C0]  ffff88802fceed00: 00 f1 f1 f1 f1 f1 f1 00 00
00 00 00 00 00 00 00
[  128.533911][    C0] >ffff88802fceed80: 00 00 00 00 00 00 00 00 00
00 f2 f2 f2 f2 f2 f2
[  128.535106][    C0]                                                        ^
[  128.536197][    C0]  ffff88802fceee00: f2 f2 f2 00 00 00 00 f2 f2
f2 f2 00 00 00 00 00
[  128.537404][    C0]  ffff88802fceee80: 00 00 00 f2 f2 f2 f2 00 00
00 00 00 00 00 00 f2

There are other stacks that end in the same place without dm-crypt
involvement, but they are much harder for us to reproduce, so let's
stick with this one.

After some bisecting from myself and Ignat, we were able to find the
commit that fixes the issue, which is:

* https://github.com/torvalds/linux/commit/ce8f86ee94fabcc98537ddccd7e82cfd360a4dc5?w=1

mm/page_alloc: add a missing mm_page_alloc_zone_locked() tracepoint

The trace point *trace_mm_page_alloc_zone_locked()* in __rmqueue() does
not currently cover all branches.  Add the missing tracepoint and check
the page before do that.

We don't have CONFIG_CMA enabled, so it can be distilled to:

$ git diff HEAD^..HEAD

If I apply this patch on top of 5.10.11, the issue disappears.

I can't say I understand the connection here.

It's worth mentioning that the issue doesn't reproduce with
UNWINDER_FRAME_POINTER rather than UNWINDER_ORC. This fact makes me
think that ORC is to blame here somehow, but it's beyond my
understanding.

Here's how I replicate the issue in qemu running Debian Buster:

# /tmp is tmpfs in our case
$ qemu-img create -f qcow2 /tmp/nvme-$USER.img 10G

$ sudo qemu-system-x86_64 -smp 1 -m 3G -enable-kvm -cpu host -kernel
~/vmlinuz -initrd ~/initrd.img -nographic -device e1000 -device
nvme,drive=nvme0,serial=deadbeaf1,num_queues=8 -drive
file=/tmp/nvme-$USER.img,if=none,id=nvme0 -append 'console=ttyS0
kasan_multi_shot'

Inside of the VM:

root@localhost:~# echo -e '[Match]\nName=enp*\n[Network]\nDHCP=yes' >
/etc/systemd/network/00-dhcp.network
root@localhost:~# systemctl restart systemd-networkd
root@localhost:~# apt-get update
root@localhost:~# apt-get install -y --no-install-recommends cryptsetup
root@localhost:~# echo potato > keyfile
root@localhost:~# chmod 0400 keyfile
root@localhost:~# cryptsetup -q luksFormat /dev/nvme0n1 keyfile
root@localhost:~# cryptsetup open --type luks --key-file keyfile
--disable-keyring /dev/nvme0n1 luks-nvme0n1
root@localhost:~# dmsetup table /dev/mapper/luks-nvme0n1 | sed 's/$/ 2
no_read_workqueue no_write_workqueue/' | dmsetup reload
/dev/mapper/luks-nvme0n1
root@localhost:~# dmsetup suspend /dev/mapper/luks-nvme0n1 && dmsetup
resume /dev/mapper/luks-nvme0n1
root@localhost:~# mkfs.xfs -f /dev/mapper/luks-nvme0n1
root@localhost:~# mount /dev/mapper/luks-nvme0n1 /mnt

The workload that triggers the KASAN complaint is the following:

root@localhost:~# while true; do rm -f /mnt/random.data.target && dd
if=/dev/zero of=/mnt/random.data bs=10M count=400 status=progress &&
mv /mnt/random.data /mnt/random.data.target; sleep 1; done

It might take a few iterations to trigger.

Note that dmcrypt setup in our case depends on Ignat's patches, which
are included in 5.10.11 and 5.11-rc5, so during bisection between
5.11-rc3 and 5.11-rc4 they needed to be reapplied.

I'm going to ask for a backport of the "fix" to stable, but it feels
like there's a bigger issue here.

Comments

Ivan Babrou Feb. 3, 2021, 3:09 a.m. UTC | #1
On Thu, Jan 28, 2021 at 7:35 PM Ivan Babrou <ivan@cloudflare.com> wrote:
>
> Hello,
>
> We've noticed the following regression in Linux 5.10 branch:
>
> [  128.367231][    C0]
> ==================================================================
> [  128.368523][    C0] BUG: KASAN: stack-out-of-bounds in
> unwind_next_frame (arch/x86/kernel/unwind_orc.c:371
> arch/x86/kernel/unwind_orc.c:544)
> [  128.369744][    C0] Read of size 8 at addr ffff88802fceede0 by task
> kworker/u2:2/591
> [  128.370916][    C0]
> [  128.371269][    C0] CPU: 0 PID: 591 Comm: kworker/u2:2 Not tainted
> 5.10.11-cloudflare-kasan-2021.1.15 #1
> [  128.372626][    C0] Hardware name: QEMU Standard PC (i440FX + PIIX,
> 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
> [  128.374346][    C0] Workqueue: writeback wb_workfn (flush-254:0)
> [  128.375275][    C0] Call Trace:
> [  128.375763][    C0]  <IRQ>
> [  128.376221][    C0]  dump_stack+0x7d/0xa3
> [  128.376843][    C0]  print_address_description.constprop.0+0x1c/0x210
> [  128.377827][    C0]  ? _raw_spin_lock_irqsave
> (arch/x86/include/asm/atomic.h:202
> include/asm-generic/atomic-instrumented.h:707
> include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:195
> include/linux/spinlock_api_smp.h:119 kernel/locking/spinlock.c:159)
> [  128.378624][    C0]  ? _raw_write_unlock_bh (kernel/locking/spinlock.c:158)
> [  128.379389][    C0]  ? unwind_next_frame (arch/x86/kernel/unwind_orc.c:444)
> [  128.380177][    C0]  ? unwind_next_frame
> (arch/x86/kernel/unwind_orc.c:371 arch/x86/kernel/unwind_orc.c:544)
> [  128.380954][    C0]  ? unwind_next_frame
> (arch/x86/kernel/unwind_orc.c:371 arch/x86/kernel/unwind_orc.c:544)
> [  128.381736][    C0]  kasan_report.cold+0x1f/0x37
> [  128.382438][    C0]  ? unwind_next_frame
> (arch/x86/kernel/unwind_orc.c:371 arch/x86/kernel/unwind_orc.c:544)
> [  128.383192][    C0]  unwind_next_frame+0x1df5/0x2650
> [  128.383954][    C0]  ? asm_common_interrupt
> (arch/x86/include/asm/idtentry.h:622)
> [  128.384726][    C0]  ? get_stack_info_noinstr
> (arch/x86/kernel/dumpstack_64.c:157)
> [  128.385530][    C0]  ? glue_xts_req_128bit+0x110/0x6f0 glue_helper
> [  128.386509][    C0]  ? deref_stack_reg (arch/x86/kernel/unwind_orc.c:418)
> [  128.387267][    C0]  ? is_module_text_address (kernel/module.c:4566
> kernel/module.c:4550)
> [  128.388077][    C0]  ? glue_xts_req_128bit+0x110/0x6f0 glue_helper
> [  128.389048][    C0]  ? kernel_text_address.part.0 (kernel/extable.c:145)
> [  128.389901][    C0]  ? glue_xts_req_128bit+0x110/0x6f0 glue_helper
> [  128.390865][    C0]  ? stack_trace_save (kernel/stacktrace.c:82)
> [  128.391550][    C0]  arch_stack_walk+0x8d/0xf0
> [  128.392216][    C0]  ? kfree (mm/slub.c:3142 mm/slub.c:4124)
> [  128.392807][    C0]  stack_trace_save+0x96/0xd0
> [  128.393535][    C0]  ? create_prof_cpu_mask (kernel/stacktrace.c:113)
> [  128.394320][    C0]  ? blk_update_request (block/blk-core.c:264
> block/blk-core.c:1468)
> [  128.395113][    C0]  ? asm_call_irq_on_stack (arch/x86/entry/entry_64.S:796)
> [  128.395887][    C0]  ? do_softirq_own_stack
> (arch/x86/include/asm/irq_stack.h:27
> arch/x86/include/asm/irq_stack.h:77 arch/x86/kernel/irq_64.c:77)
> [  128.396678][    C0]  ? irq_exit_rcu (kernel/softirq.c:393
> kernel/softirq.c:423 kernel/softirq.c:435)
> [  128.397349][    C0]  ? common_interrupt (arch/x86/kernel/irq.c:239)
> [  128.398086][    C0]  ? asm_common_interrupt
> (arch/x86/include/asm/idtentry.h:622)
> [  128.398886][    C0]  ? get_page_from_freelist (mm/page_alloc.c:3480
> mm/page_alloc.c:3904)
> [  128.399759][    C0]  kasan_save_stack+0x20/0x50
> [  128.400453][    C0]  ? kasan_save_stack (mm/kasan/common.c:48)
> [  128.401175][    C0]  ? kasan_set_track (mm/kasan/common.c:56)
> [  128.401881][    C0]  ? kasan_set_free_info (mm/kasan/generic.c:360)
> [  128.402646][    C0]  ? __kasan_slab_free (mm/kasan/common.c:283
> mm/kasan/common.c:424)
> [  128.403375][    C0]  ? slab_free_freelist_hook (mm/slub.c:1577)
> [  128.404199][    C0]  ? kfree (mm/slub.c:3142 mm/slub.c:4124)
> [  128.404835][    C0]  ? nvme_pci_complete_rq+0x105/0x350 nvme
> [  128.405765][    C0]  ? blk_done_softirq (include/linux/list.h:282
> block/blk-mq.c:581)
> [  128.406552][    C0]  ? __do_softirq
> (arch/x86/include/asm/jump_label.h:25 include/linux/jump_label.h:200
> include/trace/events/irq.h:142 kernel/softirq.c:299)
> [  128.407272][    C0]  ? asm_call_irq_on_stack (arch/x86/entry/entry_64.S:796)
> [  128.408087][    C0]  ? do_softirq_own_stack
> (arch/x86/include/asm/irq_stack.h:27
> arch/x86/include/asm/irq_stack.h:77 arch/x86/kernel/irq_64.c:77)
> [  128.408878][    C0]  ? irq_exit_rcu (kernel/softirq.c:393
> kernel/softirq.c:423 kernel/softirq.c:435)
> [  128.409602][    C0]  ? common_interrupt (arch/x86/kernel/irq.c:239)
> [  128.410366][    C0]  ? asm_common_interrupt
> (arch/x86/include/asm/idtentry.h:622)
> [  128.411184][    C0]  ? skcipher_walk_next (crypto/skcipher.c:322
> crypto/skcipher.c:384)
> [  128.412009][    C0]  ? skcipher_walk_virt (crypto/skcipher.c:487)
> [  128.412811][    C0]  ? glue_xts_req_128bit+0x110/0x6f0 glue_helper
> [  128.413792][    C0]  ? asm_common_interrupt
> (arch/x86/include/asm/idtentry.h:622)
> [  128.414562][    C0]  ? kcryptd_crypt_write_convert+0x3a2/0xa10 dm_crypt
> [  128.415591][    C0]  ? crypt_map+0x5c1/0xc70 dm_crypt
> [  128.416389][    C0]  ? __map_bio.isra.0+0x109/0x450 dm_mod
> [  128.417275][    C0]  ? __split_and_process_non_flush+0x728/0xd10 dm_mod
> [  128.418293][    C0]  ? dm_submit_bio+0x4f1/0xec0 dm_mod
> [  128.419068][    C0]  ? submit_bio_noacct (block/blk-core.c:934
> block/blk-core.c:982 block/blk-core.c:1061)
> [  128.419806][    C0]  ? submit_bio (block/blk-core.c:1079)
> [  128.420458][    C0]  ? _raw_spin_lock_irqsave
> (arch/x86/include/asm/atomic.h:202
> include/asm-generic/atomic-instrumented.h:707
> include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:195
> include/linux/spinlock_api_smp.h:119 kernel/locking/spinlock.c:159)
> [  128.421244][    C0]  ? _raw_write_unlock_bh (kernel/locking/spinlock.c:158)
> [  128.422015][    C0]  ? ret_from_fork (arch/x86/entry/entry_64.S:302)
> [  128.422696][    C0]  ? kmem_cache_free (mm/slub.c:3142 mm/slub.c:3158)
> [  128.423427][    C0]  ? memset (mm/kasan/common.c:84)
> [  128.424000][    C0]  ? dma_pool_free (mm/dmapool.c:405)
> [  128.424698][    C0]  ? slab_free_freelist_hook (mm/slub.c:1577)
> [  128.425518][    C0]  ? dma_pool_create (mm/dmapool.c:405)
> [  128.426234][    C0]  ? kmem_cache_free (mm/slub.c:3142 mm/slub.c:3158)
> [  128.426923][    C0]  ? raise_softirq_irqoff
> (arch/x86/include/asm/preempt.h:26 kernel/softirq.c:469)
> [  128.427691][    C0]  kasan_set_track+0x1c/0x30
> [  128.428366][    C0]  kasan_set_free_info+0x1b/0x30
> [  128.429113][    C0]  __kasan_slab_free+0x110/0x150
> [  128.429838][    C0]  slab_free_freelist_hook+0x66/0x120
> [  128.430628][    C0]  kfree+0xbf/0x4d0
> [  128.431192][    C0]  ? nvme_pci_complete_rq+0x105/0x350 nvme
> [  128.432107][    C0]  ? nvme_unmap_data+0x349/0x440 nvme
> [  128.432882][    C0]  nvme_pci_complete_rq+0x105/0x350 nvme
> [  128.433750][    C0]  blk_done_softirq+0x2ff/0x590
> [  128.434441][    C0]  ? blk_mq_stop_hw_queue (block/blk-mq.c:573)
> [  128.435161][    C0]  ? _raw_spin_lock_bh (kernel/locking/spinlock.c:150)
> [  128.435894][    C0]  ? _raw_spin_lock_bh (kernel/locking/spinlock.c:150)
> [  128.436582][    C0]  __do_softirq+0x1a0/0x667
> [  128.437218][    C0]  asm_call_irq_on_stack+0x12/0x20
> [  128.437975][    C0]  </IRQ>
> [  128.438397][    C0]  do_softirq_own_stack+0x37/0x40
> [  128.439120][    C0]  irq_exit_rcu+0x110/0x1b0
> [  128.439807][    C0]  common_interrupt+0x74/0x120
> [  128.440545][    C0]  asm_common_interrupt+0x1e/0x40
> [  128.441287][    C0] RIP: 0010:skcipher_walk_next
> (crypto/skcipher.c:322 crypto/skcipher.c:384)
> [  128.442126][    C0] Code: 85 dd 10 00 00 49 8d 7c 24 08 49 89 14 24
> 48 b9 00 00 00 00 00 fc ff df 41 81 e5 ff 0f 00 00 48 89 fe 48 c1 ee
> 03 80 3c 0e 00 <0f> 85 80 10 00 00 48 89 c6 4d 89 6c 24 08 48 bc
> All code
> ========
>    0: 85 dd                test   %ebx,%ebp
>    2: 10 00                adc    %al,(%rax)
>    4: 00 49 8d              add    %cl,-0x73(%rcx)
>    7: 7c 24                jl     0x2d
>    9: 08 49 89              or     %cl,-0x77(%rcx)
>    c: 14 24                adc    $0x24,%al
>    e: 48 b9 00 00 00 00 00 movabs $0xdffffc0000000000,%rcx
>   15: fc ff df
>   18: 41 81 e5 ff 0f 00 00 and    $0xfff,%r13d
>   1f: 48 89 fe              mov    %rdi,%rsi
>   22: 48 c1 ee 03          shr    $0x3,%rsi
>   26: 80 3c 0e 00          cmpb   $0x0,(%rsi,%rcx,1)
>   2a:* 0f 85 80 10 00 00    jne    0x10b0 <-- trapping instruction
>   30: 48 89 c6              mov    %rax,%rsi
>   33: 4d 89 6c 24 08        mov    %r13,0x8(%r12)
>   38: 48                    rex.W
>   39: bc                    .byte 0xbc
>
> Code starting with the faulting instruction
> ===========================================
>    0: 0f 85 80 10 00 00    jne    0x1086
>    6: 48 89 c6              mov    %rax,%rsi
>    9: 4d 89 6c 24 08        mov    %r13,0x8(%r12)
>    e: 48                    rex.W
>    f: bc                    .byte 0xbc
> [  128.445089][    C0] RSP: 0018:ffff88802fceebf0 EFLAGS: 00000246
> [  128.445969][    C0] RAX: ffff888003b571b8 RBX: 0000000000000000
> RCX: dffffc0000000000
> [  128.447124][    C0] RDX: ffffea00017cd580 RSI: 1ffff11005f9dda8
> RDI: ffff88802fceed40
> [  128.448281][    C0] RBP: ffff88802fceec70 R08: ffff88802fceedc4
> R09: 00000000ffffffee
> [  128.449457][    C0] R10: 0000000000000000 R11: 1ffff11005f9ddaf
> R12: ffff88802fceed38
> [  128.450641][    C0] R13: 0000000000000000 R14: ffff888003b57138
> R15: ffff88802fceedc8
> [  128.451827][    C0]  ? arch_stack_walk (arch/x86/kernel/stacktrace.c:24)
> [  128.452482][    C0]  skcipher_walk_virt+0x4be/0x7e0
> [  128.453242][    C0]  glue_xts_req_128bit+0x110/0x6f0 glue_helper
> [  128.454175][    C0]  ? aesni_set_key+0x1e0/0x1e0 aesni_intel
> [  128.455042][    C0]  ? irq_exit_rcu (kernel/softirq.c:406
> kernel/softirq.c:425 kernel/softirq.c:435)
> [  128.455719][    C0]  ? glue_xts_crypt_128bit_one+0x280/0x280 glue_helper
> [  128.456753][    C0]  asm_common_interrupt+0x1e/0x40
> [  128.457530][    C0] RIP: b8fa2500:0xdffffc0000000000
> [  128.458305][    C0] Code: Unable to access opcode bytes at RIP
> 0xdffffbffffffffd6.
>
> Code starting with the faulting instruction
> ===========================================
> [  128.459443][    C0] RSP: 974be3f3:ffff88809c437290 EFLAGS: 00000004
> ORIG_RAX: 0000001000000010
> [  128.460755][    C0] RAX: 0000000000000000 RBX: ffff888003b571b8
> RCX: 0000000000000000
> [  128.461967][    C0] RDX: ffff888003b57240 RSI: ffff888003b57240
> RDI: ffffffe000000010
> [  128.463152][    C0] RBP: dffffc0000000200 R08: 0000000000000801
> R09: ffffea0001123480
> [  128.464345][    C0] R10: ffffed1000000200 R11: ffffffff00000000
> R12: ffff888000000000
> [  128.465522][    C0] R13: ffff888003b57138 R14: ffff88809c437290
> R15: ffffea00002c5b08
> [  128.466710][    C0]  ? get_page_from_freelist (mm/page_alloc.c:3913)
> [  128.467560][    C0]  ? worker_thread (include/linux/list.h:282
> kernel/workqueue.c:2419)
> [  128.468279][    C0]  ? kthread (kernel/kthread.c:292)
> [  128.468919][    C0]  ? ret_from_fork (arch/x86/entry/entry_64.S:302)
> [  128.469607][    C0]  ? __writeback_inodes_wb (fs/fs-writeback.c:1793)
> [  128.470418][    C0]  ? wb_writeback (fs/fs-writeback.c:1898)
> [  128.471145][    C0]  ? process_one_work
> (arch/x86/include/asm/jump_label.h:25 include/linux/jump_label.h:200
> include/trace/events/workqueue.h:108 kernel/workqueue.c:2277)
> [  128.471930][    C0]  ? worker_thread (include/linux/list.h:282
> kernel/workqueue.c:2419)
> [  128.472668][    C0]  ? ret_from_fork (arch/x86/entry/entry_64.S:302)
> [  128.473329][    C0]  ? __zone_watermark_ok (mm/page_alloc.c:3793)
> [  128.474065][    C0]  ? __kasan_kmalloc.constprop.0
> (mm/kasan/common.c:56 mm/kasan/common.c:461)
> [  128.474914][    C0]  ? crypt_convert+0x27e5/0x4530 dm_crypt
> [  128.475796][    C0]  ? mempool_alloc (mm/mempool.c:392)
> [  128.476493][    C0]  ? crypt_iv_tcw_ctr+0x4a0/0x4a0 dm_crypt
> [  128.477433][    C0]  ? bio_add_page (block/bio.c:943)
> [  128.478129][    C0]  ? __bio_try_merge_page (block/bio.c:935)
> [  128.478923][    C0]  ? bio_associate_blkg (block/blk-cgroup.c:1869)
> [  128.479693][    C0]  ? kcryptd_crypt_write_convert+0x581/0xa10 dm_crypt
> [  128.480721][    C0]  ? crypt_map+0x5c1/0xc70 dm_crypt
> [  128.481527][    C0]  ? bio_clone_blkg_association (block/blk-cgroup.c:1883)
> [  128.482426][    C0]  ? __map_bio.isra.0+0x109/0x450 dm_mod
> [  128.483310][    C0]  ? __split_and_process_non_flush+0x728/0xd10 dm_mod
> [  128.484354][    C0]  ? __send_empty_flush+0x4b0/0x4b0 dm_mod
> [  128.485223][    C0]  ? __part_start_io_acct (block/blk-core.c:1336)
> [  128.486009][    C0]  ? dm_submit_bio+0x4f1/0xec0 dm_mod
> [  128.486829][    C0]  ? __split_and_process_non_flush+0xd10/0xd10 dm_mod
> [  128.487915][    C0]  ? submit_bio_noacct (block/blk-core.c:934
> block/blk-core.c:982 block/blk-core.c:1061)
> [  128.488686][    C0]  ? _cond_resched (kernel/sched/core.c:6124)
> [  128.489388][    C0]  ? blk_queue_enter (block/blk-core.c:1044)
> [  128.490300][    C0]  ? iomap_readahead (fs/iomap/buffered-io.c:1438)
> [  128.491041][    C0]  ? write_one_page (mm/page-writeback.c:2171)
> [  128.491759][    C0]  ? submit_bio (block/blk-core.c:1079)
> [  128.492432][    C0]  ? submit_bio_noacct (block/blk-core.c:1079)
> [  128.493248][    C0]  ? _raw_spin_lock
> (arch/x86/include/asm/atomic.h:202
> include/asm-generic/atomic-instrumented.h:707
> include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:183
> include/linux/spinlock_api_smp.h:143 kernel/locking/spinlock.c:151)
> [  128.493975][    C0]  ? iomap_submit_ioend (fs/iomap/buffered-io.c:1215)
> [  128.494761][    C0]  ? xfs_vm_writepages (fs/xfs/xfs_aops.c:578)
> [  128.495529][    C0]  ? xfs_dax_writepages (fs/xfs/xfs_aops.c:578)
> [  128.496278][    C0]  ? __blk_mq_do_dispatch_sched
> (block/blk-mq-sched.c:135 (discriminator 1))
> [  128.497120][    C0]  ? do_writepages (mm/page-writeback.c:2355)
> [  128.497831][    C0]  ? page_writeback_cpu_online (mm/page-writeback.c:2345)
> [  128.498681][    C0]  ? _raw_spin_lock
> (arch/x86/include/asm/atomic.h:202
> include/asm-generic/atomic-instrumented.h:707
> include/asm-generic/qspinlock.h:82 include/linux/spinlock.h:183
> include/linux/spinlock_api_smp.h:143 kernel/locking/spinlock.c:151)
> [  128.499405][    C0]  ? wake_up_bit (kernel/sched/wait_bit.c:15
> kernel/sched/wait_bit.c:149)
> [  128.500072][    C0]  ? __writeback_single_inode (fs/fs-writeback.c:1470)
> [  128.500908][    C0]  ? writeback_sb_inodes (fs/fs-writeback.c:1725)
> [  128.501703][    C0]  ? __writeback_single_inode (fs/fs-writeback.c:1634)
> [  128.502571][    C0]  ? finish_writeback_work.constprop.0
> (fs/fs-writeback.c:1242)
> [  128.503525][    C0]  ? __writeback_inodes_wb (fs/fs-writeback.c:1793)
> [  128.504336][    C0]  ? wb_writeback (fs/fs-writeback.c:1898)
> [  128.505031][    C0]  ? __writeback_inodes_wb (fs/fs-writeback.c:1846)
> [  128.505902][    C0]  ? cpumask_next (lib/cpumask.c:24)
> [  128.506570][    C0]  ? get_nr_dirty_inodes (fs/inode.c:94 fs/inode.c:102)
> [  128.507348][    C0]  ? wb_workfn (fs/fs-writeback.c:2054
> fs/fs-writeback.c:2082)
> [  128.508014][    C0]  ? dequeue_entity (kernel/sched/fair.c:4347)
> [  128.508744][    C0]  ? inode_wait_for_writeback (fs/fs-writeback.c:2065)
> [  128.509586][    C0]  ? put_prev_entity (kernel/sched/fair.c:4501)
> [  128.510300][    C0]  ? __switch_to
> (arch/x86/include/asm/bitops.h:55
> include/asm-generic/bitops/instrumented-atomic.h:29
> include/linux/thread_info.h:55 arch/x86/include/asm/fpu/internal.h:572
> arch/x86/kernel/process_64.c:598)
> [  128.510990][    C0]  ? __switch_to_asm (arch/x86/entry/entry_64.S:255)
> [  128.511695][    C0]  ? __schedule (kernel/sched/core.c:3782
> kernel/sched/core.c:4528)
> [  128.512373][    C0]  ? process_one_work
> (arch/x86/include/asm/jump_label.h:25 include/linux/jump_label.h:200
> include/trace/events/workqueue.h:108 kernel/workqueue.c:2277)
> [  128.513133][    C0]  ? worker_thread (include/linux/list.h:282
> kernel/workqueue.c:2419)
> [  128.513850][    C0]  ? rescuer_thread (kernel/workqueue.c:2361)
> [  128.514566][    C0]  ? kthread (kernel/kthread.c:292)
> [  128.515200][    C0]  ? __kthread_bind_mask (kernel/kthread.c:245)
> [  128.515960][    C0]  ? ret_from_fork (arch/x86/entry/entry_64.S:302)
> [  128.516641][    C0]
> [  128.516983][    C0] The buggy address belongs to the page:
> [  128.517838][    C0] page:000000007a390a2b refcount:0 mapcount:0
> mapping:0000000000000000 index:0x0 pfn:0x2fcee
> [  128.519428][    C0] flags: 0x1ffff800000000()
> [  128.520102][    C0] raw: 001ffff800000000 ffffea0000bf3b88
> ffffea0000bf3b88 0000000000000000
> [  128.521396][    C0] raw: 0000000000000000 0000000000000000
> 00000000ffffffff 0000000000000000
> [  128.522673][    C0] page dumped because: kasan: bad access detected
> [  128.523642][    C0]
> [  128.523984][    C0] addr ffff88802fceede0 is located in stack of
> task kworker/u2:2/591 at offset 216 in frame:
> [  128.525503][    C0]  glue_xts_req_128bit+0x0/0x6f0 glue_helper
> [  128.526390][    C0]
> [  128.526745][    C0] this frame has 5 objects:
> [  128.527405][    C0]  [48, 200) 'walk'
> [  128.527407][    C0]  [272, 304) 'b'
> [  128.527969][    C0]  [336, 400) 's'
> [  128.528509][    C0]  [432, 496) 'd'
> [  128.529047][    C0]  [528, 608) 'subreq'
> [  128.529607][    C0]
> [  128.530568][    C0] Memory state around the buggy address:
> [  128.531443][    C0]  ffff88802fceec80: 00 00 00 00 00 00 00 00 00
> 00 00 00 00 00 00 00
> [  128.532708][    C0]  ffff88802fceed00: 00 f1 f1 f1 f1 f1 f1 00 00
> 00 00 00 00 00 00 00
> [  128.533911][    C0] >ffff88802fceed80: 00 00 00 00 00 00 00 00 00
> 00 f2 f2 f2 f2 f2 f2
> [  128.535106][    C0]                                                        ^
> [  128.536197][    C0]  ffff88802fceee00: f2 f2 f2 00 00 00 00 f2 f2
> f2 f2 00 00 00 00 00
> [  128.537404][    C0]  ffff88802fceee80: 00 00 00 f2 f2 f2 f2 00 00
> 00 00 00 00 00 00 f2
>
> There are other stacks that end in the same place without dm-crypt
> involvement, but they are much harder for us to reproduce, so let's
> stick with this one.
>
> After some bisecting from myself and Ignat, we were able to find the
> commit that fixes the issue, which is:
>
> * https://github.com/torvalds/linux/commit/ce8f86ee94fabcc98537ddccd7e82cfd360a4dc5?w=1
>
> mm/page_alloc: add a missing mm_page_alloc_zone_locked() tracepoint
>
> The trace point *trace_mm_page_alloc_zone_locked()* in __rmqueue() does
> not currently cover all branches.  Add the missing tracepoint and check
> the page before do that.
>
> We don't have CONFIG_CMA enabled, so it can be distilled to:
>
> $ git diff HEAD^..HEAD
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 14b9e83ff9da..b5961d530929 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2871,7 +2871,8 @@ __rmqueue(struct zone *zone, unsigned int order,
> int migratetype,
>                         goto retry;
>         }
>
> -       trace_mm_page_alloc_zone_locked(page, order, migratetype);
> +       if (page)
> +               trace_mm_page_alloc_zone_locked(page, order, migratetype);
>         return page;
>  }
>
> If I apply this patch on top of 5.10.11, the issue disappears.
>
> I can't say I understand the connection here.
>
> It's worth mentioning that the issue doesn't reproduce with
> UNWINDER_FRAME_POINTER rather than UNWINDER_ORC. This fact makes me
> think that ORC is to blame here somehow, but it's beyond my
> understanding.
>
> Here's how I replicate the issue in qemu running Debian Buster:
>
> # /tmp is tmpfs in our case
> $ qemu-img create -f qcow2 /tmp/nvme-$USER.img 10G
>
> $ sudo qemu-system-x86_64 -smp 1 -m 3G -enable-kvm -cpu host -kernel
> ~/vmlinuz -initrd ~/initrd.img -nographic -device e1000 -device
> nvme,drive=nvme0,serial=deadbeaf1,num_queues=8 -drive
> file=/tmp/nvme-$USER.img,if=none,id=nvme0 -append 'console=ttyS0
> kasan_multi_shot'
>
> Inside of the VM:
>
> root@localhost:~# echo -e '[Match]\nName=enp*\n[Network]\nDHCP=yes' >
> /etc/systemd/network/00-dhcp.network
> root@localhost:~# systemctl restart systemd-networkd
> root@localhost:~# apt-get update
> root@localhost:~# apt-get install -y --no-install-recommends cryptsetup
> root@localhost:~# echo potato > keyfile
> root@localhost:~# chmod 0400 keyfile
> root@localhost:~# cryptsetup -q luksFormat /dev/nvme0n1 keyfile
> root@localhost:~# cryptsetup open --type luks --key-file keyfile
> --disable-keyring /dev/nvme0n1 luks-nvme0n1
> root@localhost:~# dmsetup table /dev/mapper/luks-nvme0n1 | sed 's/$/ 2
> no_read_workqueue no_write_workqueue/' | dmsetup reload
> /dev/mapper/luks-nvme0n1
> root@localhost:~# dmsetup suspend /dev/mapper/luks-nvme0n1 && dmsetup
> resume /dev/mapper/luks-nvme0n1
> root@localhost:~# mkfs.xfs -f /dev/mapper/luks-nvme0n1
> root@localhost:~# mount /dev/mapper/luks-nvme0n1 /mnt
>
> The workload that triggers the KASAN complaint is the following:
>
> root@localhost:~# while true; do rm -f /mnt/random.data.target && dd
> if=/dev/zero of=/mnt/random.data bs=10M count=400 status=progress &&
> mv /mnt/random.data /mnt/random.data.target; sleep 1; done
>
> It might take a few iterations to trigger.
>
> Note that dmcrypt setup in our case depends on Ignat's patches, which
> are included in 5.10.11 and 5.11-rc5, so during bisection between
> 5.11-rc3 and 5.11-rc4 they needed to be reapplied.
>
> I'm going to ask for a backport of the "fix" to stable, but it feels
> like there's a bigger issue here.

Hello again and the first hello for new people in CC as I have an update,

(Please let me know if I should get the list of people to CC not from
get_maintainers.pl, since it gave me a lot of people and it doesn't
feel right.)

We've seen the issue even after backporting ce8f86ee94fa, this time
much later in uptime, outside of dm-crypt and without a reliable
reproduction.

I noticed that the bug doesn't reproduce on Linux v5.9, so I went
ahead and bisected v5.9..v5.10-rc1 to see where it all started (with
dm-crypt reproduction).

Since there's a ton of merges and regular bisect gave me questionable
results, I had to resort to --first-parent first, which pointed at
dd502a81077a:

$ git bisect log
git bisect start '--first-parent'
# bad: [3650b228f83adda7e5ee532e2b90429c03f7b9ec] Linux 5.10-rc1
git bisect bad 3650b228f83adda7e5ee532e2b90429c03f7b9ec
# good: [bbf5c979011a099af5dc76498918ed7df445635b] Linux 5.9
git bisect good bbf5c979011a099af5dc76498918ed7df445635b
# bad: [578a7155c5a1894a789d4ece181abf9d25dc6b0d] Merge tag
'linux-kselftest-kunit-fixes-5.10-rc1' of
git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest
git bisect bad 578a7155c5a1894a789d4ece181abf9d25dc6b0d
# bad: [3ad11d7ac8872b1c8da54494721fad8907ee41f7] Merge tag
'block-5.10-2020-10-12' of git://git.kernel.dk/linux-block
git bisect bad 3ad11d7ac8872b1c8da54494721fad8907ee41f7
# bad: [b85cac574592b843c4be93c83303feeee0c4dc25] Merge tag
'x86-kaslr-2020-10-12' of
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect bad b85cac574592b843c4be93c83303feeee0c4dc25
# good: [64743e652cea9d6df4264caaa1d7f95273024afb] Merge tag
'x86_cache_for_v5.10' of
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect good 64743e652cea9d6df4264caaa1d7f95273024afb
# good: [edaa5ddf3833669a25654d42c0fb653dfdd906df] Merge tag
'sched-core-2020-10-12' of
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect good edaa5ddf3833669a25654d42c0fb653dfdd906df
# good: [34eb62d868d729e9a252aa497277081fb652eeed] Merge tag
'core-build-2020-10-12' of
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect good 34eb62d868d729e9a252aa497277081fb652eeed
# bad: [3bff6112c80cecb76af5fe485506f96e8adb6122] Merge tag
'perf-core-2020-10-12' of
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect bad 3bff6112c80cecb76af5fe485506f96e8adb6122
# bad: [dd502a81077a5f3b3e19fa9a1accffdcab5ad5bc] Merge tag
'core-static_call-2020-10-12' of
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect bad dd502a81077a5f3b3e19fa9a1accffdcab5ad5bc
# first bad commit: [dd502a81077a5f3b3e19fa9a1accffdcab5ad5bc] Merge
tag 'core-static_call-2020-10-12' of
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Since core-static_call-2020-10-12 tag is based on top of 5.9-rc3, I
rebased it on v5.9 and repeated the bisect between that and v5.9:

$ git checkout core-static_call-2020-10-12
$ git rebase v5.9
$ git checkout -b ivan/static_call-2020-10-12-rebase-on-v5.9

$ git bisect log
git bisect start
# bad: [6c2fc089268777994dd82ce7c60263f3a71ed0b4] static_call: Fix
return type of static_call_init
git bisect bad 6c2fc089268777994dd82ce7c60263f3a71ed0b4
# good: [bbf5c979011a099af5dc76498918ed7df445635b] Linux 5.9
git bisect good bbf5c979011a099af5dc76498918ed7df445635b
# good: [580b6f7a0af7823277b3ec9aeb2ff48596c10662] x86/static_call:
Add inline static call implementation for x86-64
git bisect good 580b6f7a0af7823277b3ec9aeb2ff48596c10662
# good: [574169ad2d8ce8a80d2798e502d289f6741d8096] static_call: Add
some validation
git bisect good 574169ad2d8ce8a80d2798e502d289f6741d8096
# bad: [4c9c8903fcfb8fca9ab84a8906ee23c998086549] x86/perf,
static_call: Optimize x86_pmu methods
git bisect bad 4c9c8903fcfb8fca9ab84a8906ee23c998086549
# bad: [edfd9b7838ba5e47f19ad8466d0565aba5c59bf0] tracepoint: Optimize
using static_call()
git bisect bad edfd9b7838ba5e47f19ad8466d0565aba5c59bf0
# good: [a5ea9249fde1027124f7ae42d6ca17d53fcb3df0] static_call: Allow early init
git bisect good a5ea9249fde1027124f7ae42d6ca17d53fcb3df0
# first bad commit: [edfd9b7838ba5e47f19ad8466d0565aba5c59bf0]
tracepoint: Optimize using static_call()

edfd9b7838ba5e47f19ad8466d0565aba5c59bf0 is the first bad commit
commit edfd9b7838ba5e47f19ad8466d0565aba5c59bf0
Author: Steven Rostedt (VMware) <rostedt@goodmis.org>
Date:   Tue Aug 18 15:57:52 2020 +0200

    tracepoint: Optimize using static_call()

    Currently the tracepoint site will iterate a vector and issue indirect
    calls to however many handlers are registered (ie. the vector is
    long).

    Using static_call() it is possible to optimize this for the common
    case of only having a single handler registered. In this case the
    static_call() can directly call this handler. Otherwise, if the vector
    is longer than 1, call a function that iterates the whole vector like
    the current code.

    [peterz: updated to new interface]

    Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Signed-off-by: Ingo Molnar <mingo@kernel.org>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Link: https://lore.kernel.org/r/20200818135805.279421092@infradead.org

 include/linux/tracepoint-defs.h |  5 +++
 include/linux/tracepoint.h      | 86 +++++++++++++++++++++++++++++------------
 include/trace/define_trace.h    | 14 +++----
 kernel/tracepoint.c             | 25 ++++++++++--
 4 files changed, 94 insertions(+), 36 deletions(-)

Upstream commit hash is d25e37d89dd2:

* https://github.com/torvalds/linux/commit/d25e37d89dd2

I double checked and its parent (a945c8345ec0) works fine.

Note that the "fix" for 5.10.11 was also tracepoint related:

* https://github.com/torvalds/linux/commit/ce8f86ee94fa

Let me know how I can help get this fixed or debugged further. I'm
happy to try patches.
Peter Zijlstra Feb. 3, 2021, 4:46 p.m. UTC | #2
On Tue, Feb 02, 2021 at 07:09:44PM -0800, Ivan Babrou wrote:
> On Thu, Jan 28, 2021 at 7:35 PM Ivan Babrou <ivan@cloudflare.com> wrote:

> > ==================================================================
> > [  128.368523][    C0] BUG: KASAN: stack-out-of-bounds in
> > unwind_next_frame (arch/x86/kernel/unwind_orc.c:371
> > arch/x86/kernel/unwind_orc.c:544)
> > [  128.369744][    C0] Read of size 8 at addr ffff88802fceede0 by task
> > kworker/u2:2/591

Can you pretty please not line-wrap console output? It's unreadable.

> edfd9b7838ba5e47f19ad8466d0565aba5c59bf0 is the first bad commit
> commit edfd9b7838ba5e47f19ad8466d0565aba5c59bf0

Not sure what tree you're on, but that's not the upstream commit.

> Author: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Date:   Tue Aug 18 15:57:52 2020 +0200
> 
>     tracepoint: Optimize using static_call()
> 

There's a known issue with that patch, can you try:

  http://lkml.kernel.org/r/20210202220121.435051654@goodmis.org
Ivan Babrou Feb. 3, 2021, 5:46 p.m. UTC | #3
> Can you pretty please not line-wrap console output? It's unreadable.

GMail doesn't make it easy, I'll send a link to a pastebin next time.
Let me know if you'd like me to regenerate the decoded stack.

> > edfd9b7838ba5e47f19ad8466d0565aba5c59bf0 is the first bad commit
> > commit edfd9b7838ba5e47f19ad8466d0565aba5c59bf0
>
> Not sure what tree you're on, but that's not the upstream commit.

I mentioned that it's a rebased core-static_call-2020-10-12 tag and
added a link to the upstream hash right below.

> > Author: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > Date:   Tue Aug 18 15:57:52 2020 +0200
> >
> >     tracepoint: Optimize using static_call()
> >
>
> There's a known issue with that patch, can you try:
>
>   http://lkml.kernel.org/r/20210202220121.435051654@goodmis.org

I've tried it on top of core-static_call-2020-10-12 tag rebased on top
of v5.9 (to make it reproducible), and the patch did not help. Do I
need to apply the whole series or something else?
Josh Poimboeuf Feb. 3, 2021, 7:05 p.m. UTC | #4
On Wed, Feb 03, 2021 at 09:46:55AM -0800, Ivan Babrou wrote:
> > Can you pretty please not line-wrap console output? It's unreadable.
> 
> GMail doesn't make it easy, I'll send a link to a pastebin next time.
> Let me know if you'd like me to regenerate the decoded stack.
> 
> > > edfd9b7838ba5e47f19ad8466d0565aba5c59bf0 is the first bad commit
> > > commit edfd9b7838ba5e47f19ad8466d0565aba5c59bf0
> >
> > Not sure what tree you're on, but that's not the upstream commit.
> 
> I mentioned that it's a rebased core-static_call-2020-10-12 tag and
> added a link to the upstream hash right below.
> 
> > > Author: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > Date:   Tue Aug 18 15:57:52 2020 +0200
> > >
> > >     tracepoint: Optimize using static_call()
> > >
> >
> > There's a known issue with that patch, can you try:
> >
> >   http://lkml.kernel.org/r/20210202220121.435051654@goodmis.org
> 
> I've tried it on top of core-static_call-2020-10-12 tag rebased on top
> of v5.9 (to make it reproducible), and the patch did not help. Do I
> need to apply the whole series or something else?

Can you recreate with this patch, and add "unwind_debug" to the cmdline?
It will spit out a bunch of stack data.


From: Josh Poimboeuf <jpoimboe@redhat.com>
Subject: [PATCH] Subject: [PATCH] x86/unwind: Add 'unwind_debug' cmdline
 option

Sometimes the one-line ORC unwinder warnings aren't very helpful.  Take
the existing frame pointer unwind_dump() and make it useful for all
unwinders.

I don't want to be too aggressive about enabling the dumps, so for now
they're only enabled with the use of a new 'unwind_debug' cmdline
option.  When enabled, it will dump the full contents of the stack when
an error condition is encountered, or when dump_stack() is called.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 .../admin-guide/kernel-parameters.txt         |  6 +++
 arch/x86/include/asm/unwind.h                 |  3 ++
 arch/x86/kernel/dumpstack.c                   | 39 ++++++++++++++
 arch/x86/kernel/unwind_frame.c                | 51 +++----------------
 arch/x86/kernel/unwind_orc.c                  |  5 +-
 5 files changed, 58 insertions(+), 46 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3d6604a949f8..d29689aa62a2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5521,6 +5521,12 @@
 	unknown_nmi_panic
 			[X86] Cause panic on unknown NMI.
 
+	unwind_debug	[X86-64]
+			Enable unwinder debug output.  This can be
+			useful for debugging certain unwinder error
+			conditions, including corrupt stacks and
+			bad/missing unwinder metadata.
+
 	usbcore.authorized_default=
 			[USB] Default USB device authorization:
 			(default -1 = authorized except for wireless USB,
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 70fc159ebe69..5101d7ef7912 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -123,4 +123,7 @@ static inline bool task_on_another_cpu(struct task_struct *task)
 #endif
 }
 
+extern bool unwind_debug __ro_after_init;
+void unwind_dump(struct unwind_state *state);
+
 #endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 299c20f0a38b..febfd5b7f62a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -29,6 +29,42 @@ static int die_counter;
 
 static struct pt_regs exec_summary_regs;
 
+bool unwind_debug __ro_after_init;
+static int __init unwind_debug_cmdline(char *str)
+{
+	unwind_debug = true;
+	return 0;
+}
+early_param("unwind_debug", unwind_debug_cmdline);
+
+void unwind_dump(struct unwind_state *state)
+{
+	unsigned long word, *sp;
+	struct stack_info stack_info = {0};
+	unsigned long visit_mask = 0;
+
+	printk_deferred("unwinder dump: stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n",
+			state->stack_info.type, state->stack_info.next_sp,
+			state->stack_mask, state->graph_idx);
+
+	sp = state->task == current ? __builtin_frame_address(0)
+				    : (void *)state->task->thread.sp;
+
+	for (; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+		if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
+			break;
+
+		for (; sp < stack_info.end; sp++) {
+
+			word = READ_ONCE_NOCHECK(*sp);
+
+			printk_deferred("%0*lx: %0*lx (%pB)\n", BITS_PER_LONG/4,
+					(unsigned long)sp, BITS_PER_LONG/4,
+					word, (void *)word);
+		}
+	}
+}
+
 bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task,
 			   struct stack_info *info)
 {
@@ -301,6 +337,9 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		if (stack_name)
 			printk("%s </%s>\n", log_lvl, stack_name);
 	}
+
+	if (unwind_debug)
+		unwind_dump(&state);
 }
 
 void show_stack(struct task_struct *task, unsigned long *sp,
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d7c44b257f7f..6bcdf6ecad65 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -28,48 +28,6 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
 	return state->regs ? &state->regs->ip : state->bp + 1;
 }
 
-static void unwind_dump(struct unwind_state *state)
-{
-	static bool dumped_before = false;
-	bool prev_zero, zero = false;
-	unsigned long word, *sp;
-	struct stack_info stack_info = {0};
-	unsigned long visit_mask = 0;
-
-	if (dumped_before)
-		return;
-
-	dumped_before = true;
-
-	printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n",
-			state->stack_info.type, state->stack_info.next_sp,
-			state->stack_mask, state->graph_idx);
-
-	for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
-	     sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
-		if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
-			break;
-
-		for (; sp < stack_info.end; sp++) {
-
-			word = READ_ONCE_NOCHECK(*sp);
-
-			prev_zero = zero;
-			zero = word == 0;
-
-			if (zero) {
-				if (!prev_zero)
-					printk_deferred("%p: %0*x ...\n",
-							sp, BITS_PER_LONG/4, 0);
-				continue;
-			}
-
-			printk_deferred("%p: %0*lx (%pB)\n",
-					sp, BITS_PER_LONG/4, word, (void *)word);
-		}
-	}
-}
-
 static bool in_entry_code(unsigned long ip)
 {
 	char *addr = (char *)ip;
@@ -244,7 +202,6 @@ static bool update_stack_state(struct unwind_state *state,
 						  addr, addr_p);
 	}
 
-	/* Save the original stack pointer for unwind_dump(): */
 	if (!state->orig_sp)
 		state->orig_sp = frame;
 
@@ -346,13 +303,17 @@ bool unwind_next_frame(struct unwind_state *state)
 			"WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
 			state->regs, state->task->comm,
 			state->task->pid, next_bp);
-		unwind_dump(state);
+
+		if (unwind_debug)
+			unwind_dump(state);
 	} else {
 		printk_deferred_once(KERN_WARNING
 			"WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n",
 			state->bp, state->task->comm,
 			state->task->pid, next_bp);
-		unwind_dump(state);
+
+		if (unwind_debug)
+			unwind_dump(state);
 	}
 the_end:
 	state->stack_info.type = STACK_TYPE_UNKNOWN;
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 73f800100066..38265eac41dd 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -13,8 +13,11 @@
 
 #define orc_warn_current(args...)					\
 ({									\
-	if (state->task == current)					\
+	if (state->task == current) {					\
 		orc_warn(args);						\
+		if (unwind_debug)					\
+			unwind_dump(state);				\
+	}								\
 })
 
 extern int __start_orc_unwind_ip[];
Ivan Babrou Feb. 3, 2021, 10:41 p.m. UTC | #5
On Wed, Feb 3, 2021 at 11:05 AM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>
> On Wed, Feb 03, 2021 at 09:46:55AM -0800, Ivan Babrou wrote:
> > > Can you pretty please not line-wrap console output? It's unreadable.
> >
> > GMail doesn't make it easy, I'll send a link to a pastebin next time.
> > Let me know if you'd like me to regenerate the decoded stack.
> >
> > > > edfd9b7838ba5e47f19ad8466d0565aba5c59bf0 is the first bad commit
> > > > commit edfd9b7838ba5e47f19ad8466d0565aba5c59bf0
> > >
> > > Not sure what tree you're on, but that's not the upstream commit.
> >
> > I mentioned that it's a rebased core-static_call-2020-10-12 tag and
> > added a link to the upstream hash right below.
> >
> > > > Author: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > > Date:   Tue Aug 18 15:57:52 2020 +0200
> > > >
> > > >     tracepoint: Optimize using static_call()
> > > >
> > >
> > > There's a known issue with that patch, can you try:
> > >
> > >   http://lkml.kernel.org/r/20210202220121.435051654@goodmis.org
> >
> > I've tried it on top of core-static_call-2020-10-12 tag rebased on top
> > of v5.9 (to make it reproducible), and the patch did not help. Do I
> > need to apply the whole series or something else?
>
> Can you recreate with this patch, and add "unwind_debug" to the cmdline?
> It will spit out a bunch of stack data.

Here's the three I'm building:

* https://github.com/bobrik/linux/tree/ivan/static-call-5.9

It contains:

* v5.9 tag as the base
* static_call-2020-10-12 tag
* dm-crypt patches to reproduce the issue with KASAN
* x86/unwind: Add 'unwind_debug' cmdline option
* tracepoint: Fix race between tracing and removing tracepoint

The very same issue can be reproduced on 5.10.11 with no patches,
but I'm going with 5.9, since it boils down to static call changes.

Here's the decoded stack from the kernel with unwind debug enabled:

* https://gist.github.com/bobrik/ed052ac0ae44c880f3170299ad4af56b

See my first email for the exact commands that trigger this.
Josh Poimboeuf Feb. 3, 2021, 11:27 p.m. UTC | #6
On Wed, Feb 03, 2021 at 02:41:53PM -0800, Ivan Babrou wrote:
> On Wed, Feb 3, 2021 at 11:05 AM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> >
> > On Wed, Feb 03, 2021 at 09:46:55AM -0800, Ivan Babrou wrote:
> > > > Can you pretty please not line-wrap console output? It's unreadable.
> > >
> > > GMail doesn't make it easy, I'll send a link to a pastebin next time.
> > > Let me know if you'd like me to regenerate the decoded stack.
> > >
> > > > > edfd9b7838ba5e47f19ad8466d0565aba5c59bf0 is the first bad commit
> > > > > commit edfd9b7838ba5e47f19ad8466d0565aba5c59bf0
> > > >
> > > > Not sure what tree you're on, but that's not the upstream commit.
> > >
> > > I mentioned that it's a rebased core-static_call-2020-10-12 tag and
> > > added a link to the upstream hash right below.
> > >
> > > > > Author: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > > > Date:   Tue Aug 18 15:57:52 2020 +0200
> > > > >
> > > > >     tracepoint: Optimize using static_call()
> > > > >
> > > >
> > > > There's a known issue with that patch, can you try:
> > > >
> > > >   http://lkml.kernel.org/r/20210202220121.435051654@goodmis.org
> > >
> > > I've tried it on top of core-static_call-2020-10-12 tag rebased on top
> > > of v5.9 (to make it reproducible), and the patch did not help. Do I
> > > need to apply the whole series or something else?
> >
> > Can you recreate with this patch, and add "unwind_debug" to the cmdline?
> > It will spit out a bunch of stack data.
> 
> Here's the three I'm building:
> 
> * https://github.com/bobrik/linux/tree/ivan/static-call-5.9
> 
> It contains:
> 
> * v5.9 tag as the base
> * static_call-2020-10-12 tag
> * dm-crypt patches to reproduce the issue with KASAN
> * x86/unwind: Add 'unwind_debug' cmdline option
> * tracepoint: Fix race between tracing and removing tracepoint
> 
> The very same issue can be reproduced on 5.10.11 with no patches,
> but I'm going with 5.9, since it boils down to static call changes.
> 
> Here's the decoded stack from the kernel with unwind debug enabled:
> 
> * https://gist.github.com/bobrik/ed052ac0ae44c880f3170299ad4af56b
> 
> See my first email for the exact commands that trigger this.

Thanks.  Do you happen to have the original dmesg, before running it
through the post-processing script?


I assume you're using decode_stacktrace.sh?  It could use some
improvement, it's stripping the function offset.

Also spaces are getting inserted in odd places, messing the alignment.

[  137.291837][    C0] ffff88809c409858: d7c4f3ce817a1700 (0xd7c4f3ce817a1700)
[  137.291837][    C0] ffff88809c409860: 0000000000000000 (0x0)
[  137.291839][    C0] ffff88809c409868: 00000000ffffffff (0xffffffff)
[ 137.291841][ C0] ffff88809c409870: ffffffffa4f01a52 unwind_next_frame (arch/x86/kernel/unwind_orc.c:380 arch/x86/kernel/unwind_orc.c:553)
[ 137.291843][ C0] ffff88809c409878: ffffffffa4f01a52 unwind_next_frame (arch/x86/kernel/unwind_orc.c:380 arch/x86/kernel/unwind_orc.c:553)
[  137.291844][    C0] ffff88809c409880: ffff88809c409ac8 (0xffff88809c409ac8)
[  137.291845][    C0] ffff88809c409888: 0000000000000086 (0x86)
Ivan Babrou Feb. 3, 2021, 11:30 p.m. UTC | #7
On Wed, Feb 3, 2021 at 3:28 PM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>
> On Wed, Feb 03, 2021 at 02:41:53PM -0800, Ivan Babrou wrote:
> > On Wed, Feb 3, 2021 at 11:05 AM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> > >
> > > On Wed, Feb 03, 2021 at 09:46:55AM -0800, Ivan Babrou wrote:
> > > > > Can you pretty please not line-wrap console output? It's unreadable.
> > > >
> > > > GMail doesn't make it easy, I'll send a link to a pastebin next time.
> > > > Let me know if you'd like me to regenerate the decoded stack.
> > > >
> > > > > > edfd9b7838ba5e47f19ad8466d0565aba5c59bf0 is the first bad commit
> > > > > > commit edfd9b7838ba5e47f19ad8466d0565aba5c59bf0
> > > > >
> > > > > Not sure what tree you're on, but that's not the upstream commit.
> > > >
> > > > I mentioned that it's a rebased core-static_call-2020-10-12 tag and
> > > > added a link to the upstream hash right below.
> > > >
> > > > > > Author: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > > > > Date:   Tue Aug 18 15:57:52 2020 +0200
> > > > > >
> > > > > >     tracepoint: Optimize using static_call()
> > > > > >
> > > > >
> > > > > There's a known issue with that patch, can you try:
> > > > >
> > > > >   http://lkml.kernel.org/r/20210202220121.435051654@goodmis.org
> > > >
> > > > I've tried it on top of core-static_call-2020-10-12 tag rebased on top
> > > > of v5.9 (to make it reproducible), and the patch did not help. Do I
> > > > need to apply the whole series or something else?
> > >
> > > Can you recreate with this patch, and add "unwind_debug" to the cmdline?
> > > It will spit out a bunch of stack data.
> >
> > Here's the three I'm building:
> >
> > * https://github.com/bobrik/linux/tree/ivan/static-call-5.9
> >
> > It contains:
> >
> > * v5.9 tag as the base
> > * static_call-2020-10-12 tag
> > * dm-crypt patches to reproduce the issue with KASAN
> > * x86/unwind: Add 'unwind_debug' cmdline option
> > * tracepoint: Fix race between tracing and removing tracepoint
> >
> > The very same issue can be reproduced on 5.10.11 with no patches,
> > but I'm going with 5.9, since it boils down to static call changes.
> >
> > Here's the decoded stack from the kernel with unwind debug enabled:
> >
> > * https://gist.github.com/bobrik/ed052ac0ae44c880f3170299ad4af56b
> >
> > See my first email for the exact commands that trigger this.
>
> Thanks.  Do you happen to have the original dmesg, before running it
> through the post-processing script?

Yes, here it is:

* https://gist.github.com/bobrik/8c13e6a02555fb21cadabb74cdd6f9ab

> I assume you're using decode_stacktrace.sh?  It could use some
> improvement, it's stripping the function offset.
>
> Also spaces are getting inserted in odd places, messing the alignment.
>
> [  137.291837][    C0] ffff88809c409858: d7c4f3ce817a1700 (0xd7c4f3ce817a1700)
> [  137.291837][    C0] ffff88809c409860: 0000000000000000 (0x0)
> [  137.291839][    C0] ffff88809c409868: 00000000ffffffff (0xffffffff)
> [ 137.291841][ C0] ffff88809c409870: ffffffffa4f01a52 unwind_next_frame (arch/x86/kernel/unwind_orc.c:380 arch/x86/kernel/unwind_orc.c:553)
> [ 137.291843][ C0] ffff88809c409878: ffffffffa4f01a52 unwind_next_frame (arch/x86/kernel/unwind_orc.c:380 arch/x86/kernel/unwind_orc.c:553)
> [  137.291844][    C0] ffff88809c409880: ffff88809c409ac8 (0xffff88809c409ac8)
> [  137.291845][    C0] ffff88809c409888: 0000000000000086 (0x86)
>
> --
> Josh
>
Josh Poimboeuf Feb. 4, 2021, 12:17 a.m. UTC | #8
On Wed, Feb 03, 2021 at 03:30:35PM -0800, Ivan Babrou wrote:
> > > > Can you recreate with this patch, and add "unwind_debug" to the cmdline?
> > > > It will spit out a bunch of stack data.
> > >
> > > Here's the three I'm building:
> > >
> > > * https://github.com/bobrik/linux/tree/ivan/static-call-5.9
> > >
> > > It contains:
> > >
> > > * v5.9 tag as the base
> > > * static_call-2020-10-12 tag
> > > * dm-crypt patches to reproduce the issue with KASAN
> > > * x86/unwind: Add 'unwind_debug' cmdline option
> > > * tracepoint: Fix race between tracing and removing tracepoint
> > >
> > > The very same issue can be reproduced on 5.10.11 with no patches,
> > > but I'm going with 5.9, since it boils down to static call changes.
> > >
> > > Here's the decoded stack from the kernel with unwind debug enabled:
> > >
> > > * https://gist.github.com/bobrik/ed052ac0ae44c880f3170299ad4af56b
> > >
> > > See my first email for the exact commands that trigger this.
> >
> > Thanks.  Do you happen to have the original dmesg, before running it
> > through the post-processing script?
> 
> Yes, here it is:
> 
> * https://gist.github.com/bobrik/8c13e6a02555fb21cadabb74cdd6f9ab

It appears the unwinder is getting lost in crypto code.  No idea what
this has to do with static calls though.  Or maybe you're seeing
multiple issues.

Does this fix it?


diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a31de0c6ccde..36c55341137c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,7 +2,14 @@
 #
 # x86 crypto algorithms
 
-OBJECT_FILES_NON_STANDARD := y
+OBJECT_FILES_NON_STANDARD_sha256-avx2-asm.o		:= y
+OBJECT_FILES_NON_STANDARD_sha512-ssse3-asm.o		:= y
+OBJECT_FILES_NON_STANDARD_sha512-avx-asm.o		:= y
+OBJECT_FILES_NON_STANDARD_sha512-avx2-asm.o		:= y
+OBJECT_FILES_NON_STANDARD_crc32c-pcl-intel-asm_64.o	:= y
+OBJECT_FILES_NON_STANDARD_camellia-aesni-avx2-asm_64.o	:= y
+OBJECT_FILES_NON_STANDARD_sha1_avx2_x86_64_asm.o	:= y
+OBJECT_FILES_NON_STANDARD_sha1_ni_asm.o			:= y
 
 obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
 
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 5fee47956f3b..59c36b88954f 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -237,8 +237,8 @@ define_reg j %j
 .noaltmacro
 .endm
 
-# need to push 4 registers into stack to maintain
-STACK_OFFSET = 8*4
+# need to push 5 registers into stack to maintain
+STACK_OFFSET = 8*5
 
 TMP1 =   16*0    # Temporary storage for AAD
 TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
@@ -257,6 +257,8 @@ VARIABLE_OFFSET = 16*8
 
 .macro FUNC_SAVE
         #the number of pushes must equal STACK_OFFSET
+	push	%rbp
+	mov	%rsp, %rbp
         push    %r12
         push    %r13
         push    %r14
@@ -271,12 +273,14 @@ VARIABLE_OFFSET = 16*8
 .endm
 
 .macro FUNC_RESTORE
+        add     $VARIABLE_OFFSET, %rsp
         mov     %r14, %rsp
 
         pop     %r15
         pop     %r14
         pop     %r13
         pop     %r12
+	pop	%rbp
 .endm
 
 # Encryption of a single block
Ivan Babrou Feb. 4, 2021, 12:52 a.m. UTC | #9
On Wed, Feb 3, 2021 at 4:17 PM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>
> On Wed, Feb 03, 2021 at 03:30:35PM -0800, Ivan Babrou wrote:
> > > > > Can you recreate with this patch, and add "unwind_debug" to the cmdline?
> > > > > It will spit out a bunch of stack data.
> > > >
> > > > Here's the three I'm building:
> > > >
> > > > * https://github.com/bobrik/linux/tree/ivan/static-call-5.9
> > > >
> > > > It contains:
> > > >
> > > > * v5.9 tag as the base
> > > > * static_call-2020-10-12 tag
> > > > * dm-crypt patches to reproduce the issue with KASAN
> > > > * x86/unwind: Add 'unwind_debug' cmdline option
> > > > * tracepoint: Fix race between tracing and removing tracepoint
> > > >
> > > > The very same issue can be reproduced on 5.10.11 with no patches,
> > > > but I'm going with 5.9, since it boils down to static call changes.
> > > >
> > > > Here's the decoded stack from the kernel with unwind debug enabled:
> > > >
> > > > * https://gist.github.com/bobrik/ed052ac0ae44c880f3170299ad4af56b
> > > >
> > > > See my first email for the exact commands that trigger this.
> > >
> > > Thanks.  Do you happen to have the original dmesg, before running it
> > > through the post-processing script?
> >
> > Yes, here it is:
> >
> > * https://gist.github.com/bobrik/8c13e6a02555fb21cadabb74cdd6f9ab
>
> It appears the unwinder is getting lost in crypto code.  No idea what
> this has to do with static calls though.  Or maybe you're seeing
> multiple issues.
>
> Does this fix it?

It does for the dm-crypt case! But so does the following commit in
5.11 (and 5.10.12):

* https://github.com/torvalds/linux/commit/ce8f86ee94?w=1

The reason I stuck to dm-crypt reproduction is that it reproduces reliably.

We also have the following stack that doesn't touch any crypto:

* https://gist.github.com/bobrik/40e2559add2f0b26ae39da30dc451f1e

I cannot reproduce this one, and it took 2 days of uptime for it to
happen. Is there anything I can do to help diagnose it?

My goal is to enable multishot KASAN in our pre-production
environment, but currently it sometimes starves TX queues on the NIC
due to multiple reports in a row in an interrupt about
unwind_next_frame, which disables network interface, which is not
something we can tolerate.
Josh Poimboeuf Feb. 4, 2021, 2:37 a.m. UTC | #10
On Wed, Feb 03, 2021 at 04:52:42PM -0800, Ivan Babrou wrote:
> We also have the following stack that doesn't touch any crypto:
> 
> * https://gist.github.com/bobrik/40e2559add2f0b26ae39da30dc451f1e

Can you also run this through decode_stacktrace.sh?

Both are useful (until I submit a fix for decode_stacktrace.sh).

> I cannot reproduce this one, and it took 2 days of uptime for it to
> happen. Is there anything I can do to help diagnose it?

Can you run with the same unwind_debug patch+cmdline when you try to
recreate this one?  In the meantime I'll look at the available data.
Steven Rostedt Feb. 4, 2021, 2:44 a.m. UTC | #11
On Tue, 2 Feb 2021 19:09:44 -0800
Ivan Babrou <ivan@cloudflare.com> wrote:

> On Thu, Jan 28, 2021 at 7:35 PM Ivan Babrou <ivan@cloudflare.com> wrote:
> >
> > Hello,
> >
> > We've noticed the following regression in Linux 5.10 branch:
> >
> > [  128.367231][    C0]
> > ==================================================================
> > [  128.368523][    C0] BUG: KASAN: stack-out-of-bounds in
> > unwind_next_frame (arch/x86/kernel/unwind_orc.c:371

The bug is a stack-out-of-bounds error in unwind_orc.c, right?

> > arch/x86/kernel/unwind_orc.c:544)
> > [  128.369744][    C0] Read of size 8 at addr ffff88802fceede0 by task
> > kworker/u2:2/591
> > [  128.370916][    C0]
> > [  128.371269][    C0] CPU: 0 PID: 591 Comm: kworker/u2:2 Not tainted
> > 5.10.11-cloudflare-kasan-2021.1.15 #1
> > [  128.372626][    C0] Hardware name: QEMU Standard PC (i440FX + PIIX,
> > 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
> > [  128.374346][    C0] Workqueue: writeback wb_workfn (flush-254:0)
> > [  128.375275][    C0] Call Trace:
> > [  128.375763][    C0]  <IRQ>
> > [  128.376221][    C0]  dump_stack+0x7d/0xa3
> > [  128.376843][    C0]  print_address_description.constprop.0+0x1c/0x210
[ snip ? results ]
> > (arch/x86/kernel/unwind_orc.c:371 arch/x86/kernel/unwind_orc.c:544)
[ snip ]
> > [  128.381736][    C0]  kasan_report.cold+0x1f/0x37
[ snip ]
> > [  128.383192][    C0]  unwind_next_frame+0x1df5/0x2650
[ snip ]
> > [  128.391550][    C0]  arch_stack_walk+0x8d/0xf0
[ snip ]
> > [  128.392807][    C0]  stack_trace_save+0x96/0xd0
[ snip ]
> > arch/x86/include/asm/irq_stack.h:77 arch/x86/kernel/irq_64.c:77)
[ snip ]
> > [  128.399759][    C0]  kasan_save_stack+0x20/0x50
[ snip ]
> > [  128.427691][    C0]  kasan_set_track+0x1c/0x30
> > [  128.428366][    C0]  kasan_set_free_info+0x1b/0x30
> > [  128.429113][    C0]  __kasan_slab_free+0x110/0x150
> > [  128.429838][    C0]  slab_free_freelist_hook+0x66/0x120
> > [  128.430628][    C0]  kfree+0xbf/0x4d0

[ snip the rest ]

> > [  128.441287][    C0] RIP: 0010:skcipher_walk_next
> > (crypto/skcipher.c:322 crypto/skcipher.c:384)

Why do we have an RIP in skcipher_walk_next, if its the unwinder that
had a bug? Or are they related?

Or did skcipher_walk_next trigger something in KASAN which did a stack
walk via the unwinder, and that caused another issue?

Looking at the unwinder code in question, we have:

static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
                             unsigned long *ip, unsigned long *sp)
{
        struct pt_regs *regs = (struct pt_regs *)addr;

        /* x86-32 support will be more complicated due to the &regs->sp hack */
        BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));

        if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
                return false;

        *ip = regs->ip;
        *sp = regs->sp; <- pointer to here
        return true;
}

and the caller of the above static function:

        case UNWIND_HINT_TYPE_REGS:
                if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
                        orc_warn_current("can't access registers at %pB\n",
                                         (void *)orig_ip);
                        goto err;
                }


Could it possibly be that there's some magic canary on the stack that
causes KASAN to trigger if you read it? For example, there's this in
the stack tracer:

kernel/trace/trace_stack.c: check_stack()

        while (i < stack_trace_nr_entries) {
                int found = 0;

                stack_trace_index[x] = this_size;
                p = start;

                for (; p < top && i < stack_trace_nr_entries; p++) {
                        /*
                         * The READ_ONCE_NOCHECK is used to let KASAN know that
                         * this is not a stack-out-of-bounds error.
                         */
                        if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) {
                                stack_dump_trace[x] = stack_dump_trace[i++];
                                this_size = stack_trace_index[x++] =
                                        (top - p) * sizeof(unsigned long);
                                found = 1;


That is because I read the entire stack frame looking for values, and I
know where the top of the stack is, and will not go past it. But it too
triggered a stack-out-of-bounds error, which required the above
READ_ONCE_NOCHECK() to quiet KASAN. Not to mention there's already some
READ_ONCE_NOCHECK() calls in the unwinder. Maybe this too is required?

Would this work?

diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 73f800100066..22eaf3683c2a 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
 	if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
 		return false;
 
-	*ip = regs->ip;
-	*sp = regs->sp;
+	*ip = READ_ONCE_NOCHECK(regs->ip);
+	*sp = READ_ONCE_NOCHECK(regs->sp);
 	return true;
 }
 
-- Steve
Josh Poimboeuf Feb. 4, 2021, 3:09 a.m. UTC | #12
On Wed, Feb 03, 2021 at 09:44:48PM -0500, Steven Rostedt wrote:
> > > [  128.441287][    C0] RIP: 0010:skcipher_walk_next
> > > (crypto/skcipher.c:322 crypto/skcipher.c:384)
> 
> Why do we have an RIP in skcipher_walk_next, if its the unwinder that
> had a bug? Or are they related?
> 
> Or did skcipher_walk_next trigger something in KASAN which did a stack
> walk via the unwinder, and that caused another issue?

It was interrupted by an IRQ, which then called kfree(), which then
called kasan_save_stack(), which then called the unwinder, which then
read "out-of-bounds" between stack frames.

In this case it was because of some crypto code missing ORC annotations.

> Looking at the unwinder code in question, we have:
> 
> static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
>                              unsigned long *ip, unsigned long *sp)
> {
>         struct pt_regs *regs = (struct pt_regs *)addr;
> 
>         /* x86-32 support will be more complicated due to the &regs->sp hack */
>         BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
> 
>         if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
>                 return false;
> 
>         *ip = regs->ip;
>         *sp = regs->sp; <- pointer to here
>         return true;
> }
> 
> and the caller of the above static function:
> 
>         case UNWIND_HINT_TYPE_REGS:
>                 if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
>                         orc_warn_current("can't access registers at %pB\n",
>                                          (void *)orig_ip);
>                         goto err;
>                 }
> 
> 
> Could it possibly be that there's some magic canary on the stack that
> causes KASAN to trigger if you read it?

Right, the unwinder isn't allowed to read between stack frames.

In fact, you read my mind, I was looking at the other warning in network
code:

  [160676.598929][    C4]  asm_common_interrupt+0x1e/0x40
  [160676.608966][    C4] RIP: 0010:0xffffffffc17d814c
  [160676.618812][    C4] Code: 8b 4c 24 40 4c 8b 44 24 48 48 8b 7c 24 70 48 8b 74 24 68 48 8b 54 24 60 48 8b 4c 24 58 48 8b 44 24 50 48 81 c4 a8 00 00 00 9d <c3> 20 27 af 8f ff ff ff ff 00 00 00 00 00 00 00 00 00 00 00 00 00
  [160676.649371][    C4] RSP: 0018:ffff8893dfd4f620 EFLAGS: 00000282
  [160676.661073][    C4] RAX: 0000000000000000 RBX: ffff8881be9c9c80 RCX: 0000000000000000
  [160676.674788][    C4] RDX: dffffc0000000000 RSI: 000000000000000b RDI: ffff8881be9c9c80
  [160676.688508][    C4] RBP: ffff8881be9c9ce0 R08: 0000000000000000 R09: ffff8881908c4c97
  [160676.702249][    C4] R10: ffffed1032118992 R11: ffff88818a4ce68c R12: ffff8881be9c9eea
  [160676.716000][    C4] R13: ffff8881be9c9c92 R14: ffff8880063ba5ac R15: ffff8880063ba5a8
  [160676.729895][    C4]  ? tcp_set_state+0x5/0x620
  [160676.740426][    C4]  ? tcp_fin+0xeb/0x5a0
  [160676.750287][    C4]  ? tcp_data_queue+0x1e78/0x4ce0
  [160676.761089][    C4]  ? tcp_urg+0x76/0xc50

This line gives a big clue:

  [160676.608966][    C4] RIP: 0010:0xffffffffc17d814c

That address, without a function name, most likely means that it was
running in some generated code (mostly likely BPF) when it got
interrupted.

Right now, the ORC unwinder tries to fall back to frame pointers when it
encounters generated code:

	orc = orc_find(state->signal ? state->ip : state->ip - 1);
	if (!orc)
		/*
		 * As a fallback, try to assume this code uses a frame pointer.
		 * This is useful for generated code, like BPF, which ORC
		 * doesn't know about.  This is just a guess, so the rest of
		 * the unwind is no longer considered reliable.
		 */
		orc = &orc_fp_entry;
		state->error = true;
	}

Because the ORC unwinder is guessing from that point onward, it's
possible for it to read the KASAN stack redzone, if the generated code
hasn't set up frame pointers.  So the best fix may be for the unwinder
to just always bypass KASAN when reading the stack.

The unwinder has a mechanism for detecting and warning about
out-of-bounds, and KASAN is short-circuiting that.

This should hopefully get rid of *all* the KASAN unwinder warnings, both
crypto and networking.

diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 040194d079b6..1f69a23a4715 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -376,8 +376,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
 	if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
 		return false;
 
-	*ip = regs->ip;
-	*sp = regs->sp;
+	*ip = READ_ONCE_NOCHECK(regs->ip);
+	*sp = READ_ONCE_NOCHECK(regs->sp);
 	return true;
 }
 
@@ -389,8 +389,8 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr
 	if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
 		return false;
 
-	*ip = regs->ip;
-	*sp = regs->sp;
+	*ip = READ_ONCE_NOCHECK(regs->ip);
+	*sp = READ_ONCE_NOCHECK(regs->sp);
 	return true;
 }
 
@@ -411,12 +411,12 @@ static bool get_reg(struct unwind_state *state, unsigned int reg_off,
 		return false;
 
 	if (state->full_regs) {
-		*val = ((unsigned long *)state->regs)[reg];
+		*val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]);
 		return true;
 	}
 
 	if (state->prev_regs) {
-		*val = ((unsigned long *)state->prev_regs)[reg];
+		*val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]);
 		return true;
 	}
Peter Zijlstra Feb. 4, 2021, 9:22 a.m. UTC | #13
On Wed, Feb 03, 2021 at 09:46:55AM -0800, Ivan Babrou wrote:
> > Can you pretty please not line-wrap console output? It's unreadable.
> 
> GMail doesn't make it easy, I'll send a link to a pastebin next time.
> Let me know if you'd like me to regenerate the decoded stack.

Not my problem that you can't use email proper. Links go in the
bitbucket. Either its in the email or it don't exist.
Ivan Babrou Feb. 4, 2021, 6:41 p.m. UTC | #14
On Wed, Feb 3, 2021 at 7:10 PM Josh Poimboeuf <jpoimboe@redhat.com> wrote:

> This line gives a big clue:
>
>   [160676.608966][    C4] RIP: 0010:0xffffffffc17d814c
>
> That address, without a function name, most likely means that it was
> running in some generated code (mostly likely BPF) when it got
> interrupted.

We do have eBPF/XDP in our environment.

> Right now, the ORC unwinder tries to fall back to frame pointers when it
> encounters generated code:
>
>         orc = orc_find(state->signal ? state->ip : state->ip - 1);
>         if (!orc)
>                 /*
>                  * As a fallback, try to assume this code uses a frame pointer.
>                  * This is useful for generated code, like BPF, which ORC
>                  * doesn't know about.  This is just a guess, so the rest of
>                  * the unwind is no longer considered reliable.
>                  */
>                 orc = &orc_fp_entry;
>                 state->error = true;
>         }
>
> Because the ORC unwinder is guessing from that point onward, it's
> possible for it to read the KASAN stack redzone, if the generated code
> hasn't set up frame pointers.  So the best fix may be for the unwinder
> to just always bypass KASAN when reading the stack.
>
> The unwinder has a mechanism for detecting and warning about
> out-of-bounds, and KASAN is short-circuiting that.
>
> This should hopefully get rid of *all* the KASAN unwinder warnings, both
> crypto and networking.

It definitely worked on my dm-crypt case, and I've tried it without
your previous AVX related patch. I will apply it to our tree and
deploy to the staging KASAN environment to see how it fares with
respect to networking stacks. Feel free to ping me if I don't get back
to you with the results on Monday.

Thanks for looking into this!
Ivan Babrou Feb. 4, 2021, 7:51 p.m. UTC | #15
On Wed, Feb 3, 2021 at 4:17 PM Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>
> On Wed, Feb 03, 2021 at 03:30:35PM -0800, Ivan Babrou wrote:
> > > > > Can you recreate with this patch, and add "unwind_debug" to the cmdline?
> > > > > It will spit out a bunch of stack data.
> > > >
> > > > Here's the three I'm building:
> > > >
> > > > * https://github.com/bobrik/linux/tree/ivan/static-call-5.9
> > > >
> > > > It contains:
> > > >
> > > > * v5.9 tag as the base
> > > > * static_call-2020-10-12 tag
> > > > * dm-crypt patches to reproduce the issue with KASAN
> > > > * x86/unwind: Add 'unwind_debug' cmdline option
> > > > * tracepoint: Fix race between tracing and removing tracepoint
> > > >
> > > > The very same issue can be reproduced on 5.10.11 with no patches,
> > > > but I'm going with 5.9, since it boils down to static call changes.
> > > >
> > > > Here's the decoded stack from the kernel with unwind debug enabled:
> > > >
> > > > * https://gist.github.com/bobrik/ed052ac0ae44c880f3170299ad4af56b
> > > >
> > > > See my first email for the exact commands that trigger this.
> > >
> > > Thanks.  Do you happen to have the original dmesg, before running it
> > > through the post-processing script?
> >
> > Yes, here it is:
> >
> > * https://gist.github.com/bobrik/8c13e6a02555fb21cadabb74cdd6f9ab
>
> It appears the unwinder is getting lost in crypto code.  No idea what
> this has to do with static calls though.  Or maybe you're seeing
> multiple issues.
>
> Does this fix it?
>
>
> diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
> index a31de0c6ccde..36c55341137c 100644
> --- a/arch/x86/crypto/Makefile
> +++ b/arch/x86/crypto/Makefile
> @@ -2,7 +2,14 @@
>  #
>  # x86 crypto algorithms
>
> -OBJECT_FILES_NON_STANDARD := y
> +OBJECT_FILES_NON_STANDARD_sha256-avx2-asm.o            := y
> +OBJECT_FILES_NON_STANDARD_sha512-ssse3-asm.o           := y
> +OBJECT_FILES_NON_STANDARD_sha512-avx-asm.o             := y
> +OBJECT_FILES_NON_STANDARD_sha512-avx2-asm.o            := y
> +OBJECT_FILES_NON_STANDARD_crc32c-pcl-intel-asm_64.o    := y
> +OBJECT_FILES_NON_STANDARD_camellia-aesni-avx2-asm_64.o := y
> +OBJECT_FILES_NON_STANDARD_sha1_avx2_x86_64_asm.o       := y
> +OBJECT_FILES_NON_STANDARD_sha1_ni_asm.o                        := y
>
>  obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
>
> diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
> index 5fee47956f3b..59c36b88954f 100644
> --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
> +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
> @@ -237,8 +237,8 @@ define_reg j %j
>  .noaltmacro
>  .endm
>
> -# need to push 4 registers into stack to maintain
> -STACK_OFFSET = 8*4
> +# need to push 5 registers into stack to maintain
> +STACK_OFFSET = 8*5
>
>  TMP1 =   16*0    # Temporary storage for AAD
>  TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
> @@ -257,6 +257,8 @@ VARIABLE_OFFSET = 16*8
>
>  .macro FUNC_SAVE
>          #the number of pushes must equal STACK_OFFSET
> +       push    %rbp
> +       mov     %rsp, %rbp
>          push    %r12
>          push    %r13
>          push    %r14
> @@ -271,12 +273,14 @@ VARIABLE_OFFSET = 16*8
>  .endm
>
>  .macro FUNC_RESTORE
> +        add     $VARIABLE_OFFSET, %rsp
>          mov     %r14, %rsp
>
>          pop     %r15
>          pop     %r14
>          pop     %r13
>          pop     %r12
> +       pop     %rbp
>  .endm
>
>  # Encryption of a single block
>

This patch seems to fix the following warning:

[  147.995699][    C0] WARNING: stack going in the wrong direction? at
glue_xts_req_128bit+0x21f/0x6f0 [glue_helper]

Or at least I cannot see it anymore when combined with your other
patch, not sure if it did the trick by itself.

This sounds like a good reason to send them both.
Josh Poimboeuf Feb. 4, 2021, 8:22 p.m. UTC | #16
On Thu, Feb 04, 2021 at 11:51:44AM -0800, Ivan Babrou wrote:
> >  .macro FUNC_SAVE
> >          #the number of pushes must equal STACK_OFFSET
> > +       push    %rbp
> > +       mov     %rsp, %rbp
> >          push    %r12
> >          push    %r13
> >          push    %r14
> > @@ -271,12 +273,14 @@ VARIABLE_OFFSET = 16*8
> >  .endm
> >
> >  .macro FUNC_RESTORE
> > +        add     $VARIABLE_OFFSET, %rsp
> >          mov     %r14, %rsp
> >
> >          pop     %r15
> >          pop     %r14
> >          pop     %r13
> >          pop     %r12
> > +       pop     %rbp
> >  .endm
> >
> >  # Encryption of a single block
> >
> 
> This patch seems to fix the following warning:
> 
> [  147.995699][    C0] WARNING: stack going in the wrong direction? at
> glue_xts_req_128bit+0x21f/0x6f0 [glue_helper]
> 
> Or at least I cannot see it anymore when combined with your other
> patch, not sure if it did the trick by itself.
> 
> This sounds like a good reason to send them both.

Ok, that's what I expected.

The other patch fixed the unwinder failure mode to be the above
(harmless) unwinder warning, instead of a disruptive KASAN failure.

This patch fixes the specific underlying crypto unwinding metadata
issue.

I'll definitely be sending both fixes.  The improved failure mode patch
will come first because it's more urgent and lower risk.
diff mbox series

Patch

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 14b9e83ff9da..b5961d530929 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2871,7 +2871,8 @@  __rmqueue(struct zone *zone, unsigned int order,
int migratetype,
                        goto retry;
        }

-       trace_mm_page_alloc_zone_locked(page, order, migratetype);
+       if (page)
+               trace_mm_page_alloc_zone_locked(page, order, migratetype);
        return page;
 }