bpf, cpumap: Fix use after free of bpf_cpu_map_entry in cpu_map_enqueue

Message ID 20240726180157.1065502-2-radoslaw.zielonek@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series bpf, cpumap: Fix use after free of bpf_cpu_map_entry in cpu_map_enqueue

Checks

Context Check Description
netdev/series_format warning Single patches do not need cover letters; Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 42 this patch: 42
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 16 of 16 maintainers
netdev/build_clang success Errors and warnings before: 43 this patch: 43
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 43 this patch: 43
netdev/checkpatch warning WARNING: The commit message has 'Call Trace:', perhaps it also needs a 'Fixes:' tag? WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 fail Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18

Commit Message

Radoslaw Zielonek July 26, 2024, 6:01 p.m. UTC
When a packet is redirected into a cpu_map, the pointer to the
bpf_cpu_map_entry is first copied; the entry can then be freed before
the copy is read, resulting in a use-after-free. To fix this, hold a
reference on the parent map (via a new parent_map back-pointer in
bpf_cpu_map_entry) for the duration of the redirect, so the entry
cannot be freed mid-transfer.
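
In outline, the race looks like this (a condensed sketch of the
affected path, not literal kernel code; helper names follow
kernel/bpf/cpumap.c and the XDP redirect core):

        /* XDP fast path: bpf_redirect_map() only stashes a copy of the
         * entry pointer in bpf_redirect_info; it is dereferenced later.
         */
        rcpu = __cpu_map_lookup_elem(map, index);
        ri->tgt_value = rcpu;                 /* pointer copied */

        /* Meanwhile, the map is torn down on another CPU:
         * cpu_map_free() -> __cpu_map_entry_free() -> kfree(rcpu)
         */

        /* Later, the stale copy is read and dereferenced:
         * xdp_do_redirect() -> cpu_map_enqueue(ri->tgt_value, ...)
         * => KASAN slab-use-after-free
         */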

syzbot reported:

[   61.581464][T11670] ==================================================================
[   61.583323][T11670] BUG: KASAN: slab-use-after-free in cpu_map_enqueue+0xba/0x370
[   61.585419][T11670] Read of size 8 at addr ffff888122d75208 by task syzbot-repro/11670
[   61.587541][T11670]
[   61.588237][T11670] CPU: 1 PID: 11670 Comm: syzbot-repro Not tainted 6.9.0-rc6-00053-g0106679839f7 #27
[   61.590542][T11670] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.1 11/11/2019
[   61.592798][T11670] Call Trace:
[   61.593885][T11670]  <TASK>
[   61.594805][T11670]  dump_stack_lvl+0x241/0x360
[   61.595974][T11670]  ? tcp_gro_dev_warn+0x260/0x260
[   61.598242][T11670]  ? __wake_up_klogd+0xcc/0x100
[   61.599407][T11670]  ? panic+0x850/0x850
[   61.600516][T11670]  ? __virt_addr_valid+0x182/0x510
[   61.602073][T11670]  ? __virt_addr_valid+0x182/0x510
[   61.603496][T11670]  print_address_description+0x7b/0x360
[   61.605170][T11670]  print_report+0xfd/0x210
[   61.606370][T11670]  ? __virt_addr_valid+0x182/0x510
[   61.607925][T11670]  ? __virt_addr_valid+0x182/0x510
[   61.609577][T11670]  ? __virt_addr_valid+0x43d/0x510
[   61.610948][T11670]  ? __phys_addr+0xb9/0x170
[   61.612103][T11670]  ? cpu_map_enqueue+0xba/0x370
[   61.613448][T11670]  kasan_report+0x143/0x180
[   61.615000][T11670]  ? cpu_map_enqueue+0xba/0x370
[   61.616181][T11670]  cpu_map_enqueue+0xba/0x370
[   61.617620][T11670]  xdp_do_redirect+0x685/0xbf0
[   61.618787][T11670]  tun_xdp_act+0xe7/0x9e0
[   61.619856][T11670]  ? __tun_build_skb+0x2e0/0x2e0
[   61.621356][T11670]  tun_build_skb+0xac6/0x1140
[   61.622602][T11670]  ? tun_build_skb+0xb4/0x1140
[   61.623880][T11670]  ? tun_get_user+0x2760/0x2760
[   61.625341][T11670]  tun_get_user+0x7fa/0x2760
[   61.626532][T11670]  ? rcu_read_unlock+0xa0/0xa0
[   61.627725][T11670]  ? tun_get+0x1e/0x2f0
[   61.629147][T11670]  ? tun_get+0x1e/0x2f0
[   61.630265][T11670]  ? tun_get+0x27d/0x2f0
[   61.631486][T11670]  tun_chr_write_iter+0x111/0x1f0
[   61.632855][T11670]  vfs_write+0xa84/0xcb0
[   61.634185][T11670]  ? __lock_acquire+0x1f60/0x1f60
[   61.635501][T11670]  ? kernel_write+0x330/0x330
[   61.636757][T11670]  ? lockdep_hardirqs_on_prepare+0x43c/0x780
[   61.638445][T11670]  ? __fget_files+0x3ea/0x460
[   61.639448][T11670]  ? seqcount_lockdep_reader_access+0x157/0x220
[   61.641217][T11670]  ? __fdget_pos+0x19e/0x320
[   61.642426][T11670]  ksys_write+0x19f/0x2c0
[   61.643576][T11670]  ? __ia32_sys_read+0x90/0x90
[   61.644841][T11670]  ? ktime_get_coarse_real_ts64+0x10b/0x120
[   61.646549][T11670]  do_syscall_64+0xec/0x210
[   61.647832][T11670]  entry_SYSCALL_64_after_hwframe+0x67/0x6f
[   61.649485][T11670] RIP: 0033:0x472a4f
[   61.650539][T11670] Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 c9 d8 02 00 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 0c d9 02 00 48
[   61.655476][T11670] RSP: 002b:00007f7a7a90f5c0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
[   61.657675][T11670] RAX: ffffffffffffffda RBX: 00007f7a7a911640 RCX: 0000000000472a4f
[   61.659658][T11670] RDX: 0000000000000066 RSI: 0000000020000440 RDI: 00000000000000c8
[   61.661980][T11670] RBP: 00007f7a7a90f620 R08: 0000000000000000 R09: 0000000100000000
[   61.663982][T11670] R10: 0000000100000000 R11: 0000000000000293 R12: 00007f7a7a911640
[   61.666425][T11670] R13: 000000000000006e R14: 000000000042f2f0 R15: 00007f7a7a8f1000
[   61.668443][T11670]  </TASK>
[   61.669233][T11670]
[   61.669754][T11670] Allocated by task 11643:
[   61.670855][T11670]  kasan_save_track+0x3f/0x70
[   61.672094][T11670]  __kasan_kmalloc+0x98/0xb0
[   61.673466][T11670]  __kmalloc_node+0x259/0x4f0
[   61.674687][T11670]  bpf_map_kmalloc_node+0xd3/0x1c0
[   61.676069][T11670]  cpu_map_update_elem+0x2f0/0x1000
[   61.677619][T11670]  bpf_map_update_value+0x1b2/0x540
[   61.679006][T11670]  map_update_elem+0x52f/0x6e0
[   61.680076][T11670]  __sys_bpf+0x7a9/0x850
[   61.681610][T11670]  __x64_sys_bpf+0x7c/0x90
[   61.682772][T11670]  do_syscall_64+0xec/0x210
[   61.683967][T11670]  entry_SYSCALL_64_after_hwframe+0x67/0x6f
[   61.685648][T11670]
[   61.686282][T11670] Freed by task 1064:
[   61.687296][T11670]  kasan_save_track+0x3f/0x70
[   61.688498][T11670]  kasan_save_free_info+0x40/0x50
[   61.689786][T11670]  poison_slab_object+0xa6/0xe0
[   61.691059][T11670]  __kasan_slab_free+0x37/0x60
[   61.692336][T11670]  kfree+0x136/0x2f0
[   61.693549][T11670]  __cpu_map_entry_free+0x6f3/0x770
[   61.695004][T11670]  cpu_map_free+0xc0/0x180
[   61.696191][T11670]  bpf_map_free_deferred+0xe3/0x100
[   61.697703][T11670]  process_scheduled_works+0x9cb/0x14a0
[   61.699330][T11670]  worker_thread+0x85c/0xd50
[   61.700546][T11670]  kthread+0x2ef/0x390
[   61.701791][T11670]  ret_from_fork+0x4d/0x80
[   61.702942][T11670]  ret_from_fork_asm+0x11/0x20
[   61.704195][T11670]
[   61.704825][T11670] The buggy address belongs to the object at ffff888122d75200
[   61.704825][T11670]  which belongs to the cache kmalloc-cg-256 of size 256
[   61.708516][T11670] The buggy address is located 8 bytes inside of
[   61.708516][T11670]  freed 256-byte region [ffff888122d75200, ffff888122d75300)
[   61.712215][T11670]
[   61.712824][T11670] The buggy address belongs to the physical page:
[   61.714883][T11670] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x122d74
[   61.717300][T11670] head: order:1 entire_mapcount:0 nr_pages_mapped:0 pincount:0
[   61.719037][T11670] memcg:ffff888120d85f01
[   61.720006][T11670] flags: 0x17ff00000000840(slab|head|node=0|zone=2|lastcpupid=0x7ff)
[   61.722181][T11670] page_type: 0xffffffff()
[   61.723318][T11670] raw: 017ff00000000840 ffff88810004dcc0 dead000000000122 0000000000000000
[   61.725650][T11670] raw: 0000000000000000 0000000080100010 00000001ffffffff ffff888120d85f01
[   61.727943][T11670] head: 017ff00000000840 ffff88810004dcc0 dead000000000122 0000000000000000
[   61.730237][T11670] head: 0000000000000000 0000000080100010 00000001ffffffff ffff888120d85f01
[   61.732671][T11670] head: 017ff00000000001 ffffea00048b5d01 dead000000000122 00000000ffffffff
[   61.735029][T11670] head: 0000000200000000 0000000000000000 00000000ffffffff 0000000000000000
[   61.737400][T11670] page dumped because: kasan: bad access detected
[   61.740100][T11670] page_owner tracks the page as allocated
[   61.743121][T11670] page last allocated via order 1, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 8343, tgid -2092279795 (syzbot-repro), ts 8343, free_ts 43505720198
[   61.754038][T11670]  post_alloc_hook+0x1e6/0x210
[   61.756046][T11670]  get_page_from_freelist+0x7d2/0x850
[   61.759460][T11670]  __alloc_pages+0x25e/0x580
[   61.761428][T11670]  alloc_slab_page+0x6b/0x1a0
[   61.764199][T11670]  allocate_slab+0x5d/0x200
[   61.766122][T11670]  ___slab_alloc+0xac5/0xf20
[   61.767195][T11670]  __kmalloc+0x2e0/0x4b0
[   61.769028][T11670]  fib_default_rule_add+0x4a/0x350
[   61.770394][T11670]  fib6_rules_net_init+0x42/0x100
[   61.771731][T11670]  ops_init+0x39d/0x670
[   61.773061][T11670]  setup_net+0x3bc/0xae0
[   61.774102][T11670]  copy_net_ns+0x399/0x5e0
[   61.775628][T11670]  create_new_namespaces+0x4de/0x8d0
[   61.776950][T11670]  unshare_nsproxy_namespaces+0x127/0x190
[   61.778352][T11670]  ksys_unshare+0x5e6/0xbf0
[   61.779741][T11670]  __x64_sys_unshare+0x38/0x40
[   61.781302][T11670] page last free pid 4619 tgid 4619 stack trace:
[   61.783542][T11670]  free_unref_page_prepare+0x72f/0x7c0
[   61.785018][T11670]  free_unref_page+0x37/0x3f0
[   61.786030][T11670]  __slab_free+0x351/0x3f0
[   61.786991][T11670]  qlist_free_all+0x60/0xd0
[   61.788827][T11670]  kasan_quarantine_reduce+0x15a/0x170
[   61.789951][T11670]  __kasan_slab_alloc+0x23/0x70
[   61.790999][T11670]  kmem_cache_alloc_node+0x193/0x390
[   61.792331][T11670]  kmalloc_reserve+0xa7/0x2a0
[   61.793345][T11670]  __alloc_skb+0x1ec/0x430
[   61.794435][T11670]  netlink_sendmsg+0x615/0xc80
[   61.796439][T11670]  __sock_sendmsg+0x21f/0x270
[   61.797467][T11670]  ____sys_sendmsg+0x540/0x860
[   61.798505][T11670]  __sys_sendmsg+0x2b7/0x3a0
[   61.799512][T11670]  do_syscall_64+0xec/0x210
[   61.800674][T11670]  entry_SYSCALL_64_after_hwframe+0x67/0x6f
[   61.802021][T11670]
[   61.802526][T11670] Memory state around the buggy address:
[   61.803701][T11670]  ffff888122d75100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   61.805694][T11670]  ffff888122d75180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   61.808104][T11670] >ffff888122d75200: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   61.809769][T11670]                       ^
[   61.810672][T11670]  ffff888122d75280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   61.812532][T11670]  ffff888122d75300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   61.814846][T11670] ==================================================================
[   61.816914][T11670] Kernel panic - not syncing: KASAN: panic_on_warn set ...
[   61.818415][T11670] CPU: 1 PID: 11670 Comm: syzbot-repro Not tainted 6.9.0-rc6-00053-g0106679839f7 #27
[   61.821191][T11670] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.1 11/11/2019
[   61.822911][T11670] Call Trace:
[   61.823632][T11670]  <TASK>
[   61.824525][T11670]  dump_stack_lvl+0x241/0x360
[   61.825545][T11670]  ? tcp_gro_dev_warn+0x260/0x260
[   61.826706][T11670]  ? panic+0x850/0x850
[   61.828594][T11670]  ? lock_release+0x85/0x860
[   61.829749][T11670]  ? vscnprintf+0x5d/0x80
[   61.830951][T11670]  panic+0x335/0x850
[   61.832316][T11670]  ? check_panic_on_warn+0x21/0xa0
[   61.834475][T11670]  ? __memcpy_flushcache+0x2c0/0x2c0
[   61.835809][T11670]  ? _raw_spin_unlock_irqrestore+0xd8/0x140
[   61.838063][T11670]  ? _raw_spin_unlock_irqrestore+0xdd/0x140
[   61.842056][T11670]  ? _raw_spin_unlock+0x40/0x40
[   61.843116][T11670]  ? print_report+0x1cc/0x210
[   61.844527][T11670]  check_panic_on_warn+0x82/0xa0
[   61.845336][T11670]  ? cpu_map_enqueue+0xba/0x370
[   61.846117][T11670]  end_report+0x48/0xa0
[   61.846790][T11670]  kasan_report+0x154/0x180
[   61.847520][T11670]  ? cpu_map_enqueue+0xba/0x370
[   61.848471][T11670]  cpu_map_enqueue+0xba/0x370
[   61.849968][T11670]  xdp_do_redirect+0x685/0xbf0
[   61.850994][T11670]  tun_xdp_act+0xe7/0x9e0
[   61.851703][T11670]  ? __tun_build_skb+0x2e0/0x2e0
[   61.852598][T11670]  tun_build_skb+0xac6/0x1140
[   61.853362][T11670]  ? tun_build_skb+0xb4/0x1140
[   61.854454][T11670]  ? tun_get_user+0x2760/0x2760
[   61.855806][T11670]  tun_get_user+0x7fa/0x2760
[   61.856734][T11670]  ? rcu_read_unlock+0xa0/0xa0
[   61.857502][T11670]  ? tun_get+0x1e/0x2f0
[   61.858171][T11670]  ? tun_get+0x1e/0x2f0
[   61.858952][T11670]  ? tun_get+0x27d/0x2f0
[   61.859637][T11670]  tun_chr_write_iter+0x111/0x1f0
[   61.860913][T11670]  vfs_write+0xa84/0xcb0
[   61.861578][T11670]  ? __lock_acquire+0x1f60/0x1f60
[   61.862376][T11670]  ? kernel_write+0x330/0x330
[   61.863221][T11670]  ? lockdep_hardirqs_on_prepare+0x43c/0x780
[   61.864230][T11670]  ? __fget_files+0x3ea/0x460
[   61.864955][T11670]  ? seqcount_lockdep_reader_access+0x157/0x220
[   61.866571][T11670]  ? __fdget_pos+0x19e/0x320
[   61.867414][T11670]  ksys_write+0x19f/0x2c0
[   61.868263][T11670]  ? __ia32_sys_read+0x90/0x90
[   61.868996][T11670]  ? ktime_get_coarse_real_ts64+0x10b/0x120
[   61.869896][T11670]  do_syscall_64+0xec/0x210
[   61.870592][T11670]  entry_SYSCALL_64_after_hwframe+0x67/0x6f
[   61.871595][T11670] RIP: 0033:0x472a4f
[   61.873158][T11670] Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 c9 d8 02 00 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 0c d9 02 00 48
[   61.876447][T11670] RSP: 002b:00007f7a7a90f5c0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
[   61.877944][T11670] RAX: ffffffffffffffda RBX: 00007f7a7a911640 RCX: 0000000000472a4f
[   61.879751][T11670] RDX: 0000000000000066 RSI: 0000000020000440 RDI: 00000000000000c8
[   61.881100][T11670] RBP: 00007f7a7a90f620 R08: 0000000000000000 R09: 0000000100000000
[   61.882298][T11670] R10: 0000000100000000 R11: 0000000000000293 R12: 00007f7a7a911640
[   61.883501][T11670] R13: 000000000000006e R14: 000000000042f2f0 R15: 00007f7a7a8f1000
[   61.885999][T11670]  </TASK>

Signed-off-by: Radoslaw Zielonek <radoslaw.zielonek@gmail.com>
---
 kernel/bpf/cpumap.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

Comments

Toke Høiland-Jørgensen July 26, 2024, 6:55 p.m. UTC | #1
Radoslaw Zielonek <radoslaw.zielonek@gmail.com> writes:

> When a packet is redirected into a cpu_map, the pointer to the
> bpf_cpu_map_entry is first copied; the entry can then be freed before
> the copy is read, resulting in a use-after-free. To fix this, hold a
> reference on the parent map (via a new parent_map back-pointer in
> bpf_cpu_map_entry) for the duration of the redirect, so the entry
> cannot be freed mid-transfer.
>
> syzbot reported:
>
> [   61.581464][T11670] ==================================================================
> [   61.583323][T11670] BUG: KASAN: slab-use-after-free in cpu_map_enqueue+0xba/0x370
> [   61.585419][T11670] Read of size 8 at addr ffff888122d75208 by task syzbot-repro/11670

SNIP
>
> Signed-off-by: Radoslaw Zielonek <radoslaw.zielonek@gmail.com>
> ---
>  kernel/bpf/cpumap.c | 22 ++++++++++++++++++++++
>  1 file changed, 22 insertions(+)
>
> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> index a8e34416e960..0034a6d423b6 100644
> --- a/kernel/bpf/cpumap.c
> +++ b/kernel/bpf/cpumap.c
> @@ -59,6 +59,9 @@ struct bpf_cpu_map_entry {
>  	u32 cpu;    /* kthread CPU and map index */
>  	int map_id; /* Back reference to map */
>  
> +	/* Used to end ownership transfer transaction */
> +	struct bpf_map *parent_map;
> +
>  	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
>  	struct xdp_bulk_queue __percpu *bulkq;
>  
> @@ -427,6 +430,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
>  	rcpu->cpu    = cpu;
>  	rcpu->map_id = map->id;
>  	rcpu->value.qsize  = value->qsize;
> +	rcpu->parent_map = map;
>  
>  	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
>  		goto free_ptr_ring;
> @@ -639,6 +643,14 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>  
>  static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
>  {
> +	/*
> +	 * Redirection is a transfer of ownership of the bpf_cpu_map_entry.
> +	 * During the transfer the bpf_cpu_map_entry is still in the map,
> +	 * so we need to prevent it from being freed.
> +	 * bpf_map_inc() increments the refcnt of the map, so the
> +	 * bpf_cpu_map_entry will not be freed until the refcnt is decremented.
> +	 */
> +	bpf_map_inc(map);

Adding refcnt increase/decrease in the fast path? Hard NAK.

The map entry is protected by RCU, which should prevent this kind of UAF
from happening. Looks like maybe there's a bug in the tun driver so this
RCU protection is not working?

-Toke
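
For context, the RCU protection referred to above is the usual XDP
map-lookup pattern; a minimal sketch (the reader side mirrors cpumap,
the writer side is simplified and the callback name is approximate):

        /* Reader: the redirect path runs under rcu_read_lock(), so a
         * looked-up entry must stay valid until rcu_read_unlock().
         */
        rcu_read_lock();
        rcpu = rcu_dereference(cmap->cpu_map[index]);
        if (rcpu)
                cpu_map_enqueue(rcpu, xdpf, dev_rx);
        rcu_read_unlock();

        /* Writer: unpublish the entry first, free it only after a grace
         * period, so in-flight readers never see freed memory.
         */
        old = unrcu_pointer(xchg(&cmap->cpu_map[index], NULL));
        if (old)
                call_rcu(&old->rcu, __cpu_map_entry_free);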
Yonghong Song July 26, 2024, 7:11 p.m. UTC | #2
On 7/26/24 11:55 AM, Toke Høiland-Jørgensen wrote:
> Radoslaw Zielonek <radoslaw.zielonek@gmail.com> writes:
>
>> When a packet is redirected into a cpu_map, the pointer to the
>> bpf_cpu_map_entry is first copied; the entry can then be freed before
>> the copy is read, resulting in a use-after-free. To fix this, hold a
>> reference on the parent map (via a new parent_map back-pointer in
>> bpf_cpu_map_entry) for the duration of the redirect, so the entry
>> cannot be freed mid-transfer.
>>
>> syzbot reported:
>>
>> [   61.581464][T11670] ==================================================================
>> [   61.583323][T11670] BUG: KASAN: slab-use-after-free in cpu_map_enqueue+0xba/0x370
>> [   61.585419][T11670] Read of size 8 at addr ffff888122d75208 by task syzbot-repro/11670

SNIP
>>
>> Signed-off-by: Radoslaw Zielonek <radoslaw.zielonek@gmail.com>
>> ---
>>   kernel/bpf/cpumap.c | 22 ++++++++++++++++++++++
>>   1 file changed, 22 insertions(+)
>>
>> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
>> index a8e34416e960..0034a6d423b6 100644
>> --- a/kernel/bpf/cpumap.c
>> +++ b/kernel/bpf/cpumap.c
>> @@ -59,6 +59,9 @@ struct bpf_cpu_map_entry {
>>   	u32 cpu;    /* kthread CPU and map index */
>>   	int map_id; /* Back reference to map */
>>   
>> +	/* Used to end ownership transfer transaction */
>> +	struct bpf_map *parent_map;
>> +
>>   	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
>>   	struct xdp_bulk_queue __percpu *bulkq;
>>   
>> @@ -427,6 +430,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
>>   	rcpu->cpu    = cpu;
>>   	rcpu->map_id = map->id;
>>   	rcpu->value.qsize  = value->qsize;
>> +	rcpu->parent_map = map;

Is it possible that we increase map reference count here and decrease reference count at __cpu_map_entry_free()?
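
A sketch of that pairing (hypothetical placement, teardown and error
paths omitted; only the inc/put pairing is the point):

        /* __cpu_map_entry_alloc(): pin the map once per entry instead
         * of once per redirect.
         */
        rcpu->parent_map = map;
        bpf_map_inc(map);

        /* __cpu_map_entry_free(): drop the pin once the entry is gone. */
        bpf_map_put(rcpu->parent_map);

One wrinkle: entries pinning their own parent map form a reference
cycle, since the map's refcount could then never reach zero while the
map still holds entries, yet it is the map free path that releases the
entries.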

>>   
>>   	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
>>   		goto free_ptr_ring;
>> @@ -639,6 +643,14 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>>   
>>   static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
>>   {
>> +	/*
>> +	 * Redirection is a transfer of ownership of the bpf_cpu_map_entry.
>> +	 * During the transfer the bpf_cpu_map_entry is still in the map,
>> +	 * so we need to prevent it from being freed.
>> +	 * bpf_map_inc() increments the refcnt of the map, so the
>> +	 * bpf_cpu_map_entry will not be freed until the refcnt is decremented.
>> +	 */
>> +	bpf_map_inc(map);
> Adding refcnt increase/decrease in the fast path? Hard NAK.
>
> The map entry is protected by RCU, which should prevent this kind of UAF
> from happening. Looks like maybe there's a bug in the tun driver so this
> RCU protection is not working?
>
> -Toke
Hou Tao July 27, 2024, 4:36 a.m. UTC | #3
On 7/27/2024 2:55 AM, Toke Høiland-Jørgensen wrote:
> Radoslaw Zielonek <radoslaw.zielonek@gmail.com> writes:
>
>> When a packet is redirected into a cpu_map, the pointer to the
>> bpf_cpu_map_entry is first copied; the entry can then be freed before
>> the copy is read, resulting in a use-after-free. To fix this, hold a
>> reference on the parent map (via a new parent_map back-pointer in
>> bpf_cpu_map_entry) for the duration of the redirect, so the entry
>> cannot be freed mid-transfer.
>>
>> syzbot reported:
>>
>> [   61.581464][T11670] ==================================================================
>> [   61.583323][T11670] BUG: KASAN: slab-use-after-free in cpu_map_enqueue+0xba/0x370
>> [   61.585419][T11670] Read of size 8 at addr ffff888122d75208 by task syzbot-repro/11670
>> [   61.587541][T11670]
>> [   61.588237][T11670] CPU: 1 PID: 11670 Comm: syzbot-repro Not tainted 6.9.0-rc6-00053-g0106679839f7 #27
>> [   61.590542][T11670] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.1 11/11/2019

SNIP
>> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
>> index a8e34416e960..0034a6d423b6 100644
>> --- a/kernel/bpf/cpumap.c
>> +++ b/kernel/bpf/cpumap.c
>> @@ -59,6 +59,9 @@ struct bpf_cpu_map_entry {
>>  	u32 cpu;    /* kthread CPU and map index */
>>  	int map_id; /* Back reference to map */
>>  
>> +	/* Used to end ownership transfer transaction */
>> +	struct bpf_map *parent_map;
>> +
>>  	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
>>  	struct xdp_bulk_queue __percpu *bulkq;
>>  
>> @@ -427,6 +430,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
>>  	rcpu->cpu    = cpu;
>>  	rcpu->map_id = map->id;
>>  	rcpu->value.qsize  = value->qsize;
>> +	rcpu->parent_map = map;
>>  
>>  	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
>>  		goto free_ptr_ring;
>> @@ -639,6 +643,14 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>>  
>>  static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
>>  {
>> +	/*
>> +	 * Redirection is a transfer of ownership of the bpf_cpu_map_entry.
>> +	 * During the transfer the bpf_cpu_map_entry is still in the map,
>> +	 * so we need to prevent it from being freed.
>> +	 * bpf_map_inc() increments the refcnt of the map, so the
>> +	 * bpf_cpu_map_entry will not be freed until the refcnt is decremented.
>> +	 */
>> +	bpf_map_inc(map);
> Adding refcnt increase/decrease in the fast path? Hard NAK.
>
> The map entry is protected by RCU, which should prevent this kind of UAF
> from happening. Looks like maybe there's a bug in the tun driver so this
> RCU protection is not working?

It is possible if two different xdp programs set and use the value
of ri->tgt_value separately, as shown below (see the sketch after
this list):

(1) on CPU 0: xdp program A invokes bpf_redirect_map() (e.g., through
test_run) and sets ri->tgt_value to one entry in cpu map X
(2) xdp program A is released and cpu map X is freed.
(3) on CPU 0: xdp program B doesn't invoke bpf_redirect_map() but still
returns XDP_REDIRECT, so the stale ri->tgt_value left behind by program
A is used for the redirect.
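
A condensed sketch of that sequence (pre-401cb7dae813 semantics, where
bpf_redirect_info is per-cpu; illustrative, not literal driver code):

        /* (1) program A: bpf_redirect_map() stashes the entry pointer
         * in the per-cpu bpf_redirect_info and returns XDP_REDIRECT.
         */
        ri = this_cpu_ptr(&bpf_redirect_info);
        ri->tgt_value = __cpu_map_lookup_elem(map_x, index);

        /* (2) program A is detached and cpu map X is freed; the memory
         * behind ri->tgt_value is released, but ri is never cleared.
         */

        /* (3) program B returns XDP_REDIRECT without calling
         * bpf_redirect_map(); xdp_do_redirect() consumes the stale
         * per-cpu state and dereferences freed memory.
         */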

I think the problem is fixed after the merge of commit 401cb7dae813
("net: Reference bpf_redirect_info via task_struct on PREEMPT_RT").
Before that commit, bpf_redirect_info was a per-cpu variable, and
ri->tgt_value was not cleared when an xdp program finished running, so
one xdp program could use a stale tgt_value set by another xdp program.
After changing bpf_redirect_info into a per-task variable and clearing
it after each xdp program run, such sharing is impossible.
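
Roughly, after that commit the redirect state lives in a context that
is installed per task and torn down after each run (names follow the
commit; exact signatures are approximate):

        struct bpf_net_context ctx;

        bpf_net_ctx_set(&ctx);     /* current->bpf_net_context = &ctx */
        act = bpf_prog_run_xdp(prog, &xdp);
        if (act == XDP_REDIRECT)
                err = xdp_do_redirect(dev, &xdp, prog);
        bpf_net_ctx_clear(&ctx);   /* no stale state survives the run */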

Zielonek, could you please check whether the problem is still
reproducible on the latest bpf-next tree?

>
> -Toke

Patch

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a8e34416e960..0034a6d423b6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -59,6 +59,9 @@  struct bpf_cpu_map_entry {
 	u32 cpu;    /* kthread CPU and map index */
 	int map_id; /* Back reference to map */
 
+	/* Used to end ownership transfer transaction */
+	struct bpf_map *parent_map;
+
 	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
 	struct xdp_bulk_queue __percpu *bulkq;
 
@@ -427,6 +430,7 @@  __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
 	rcpu->cpu    = cpu;
 	rcpu->map_id = map->id;
 	rcpu->value.qsize  = value->qsize;
+	rcpu->parent_map = map;
 
 	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
 		goto free_ptr_ring;
@@ -639,6 +643,14 @@  static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
 {
+	/*
+	 * Redirection is a transfer of ownership of the bpf_cpu_map_entry.
+	 * During the transfer the bpf_cpu_map_entry is still in the map,
+	 * so we need to prevent it from being freed.
+	 * bpf_map_inc() increments the refcnt of the map, so the
+	 * bpf_cpu_map_entry will not be freed until the refcnt is decremented.
+	 */
+	bpf_map_inc(map);
 	return __bpf_xdp_redirect_map(map, index, flags, 0,
 				      __cpu_map_lookup_elem);
 }
@@ -764,6 +776,16 @@  void __cpu_map_flush(void)
 	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
 		bq_flush_to_queue(bq);
 
+		/*
+		 * The flush is the last operation of the ownership transfer
+		 * transaction, so it is now safe to drop the reference on
+		 * the parent map taken in cpu_map_redirect().
+		 */
+		struct bpf_map *map = bq->obj->parent_map;
+
+		if (map)
+			bpf_map_put(map);
+
 		/* If already running, costs spin_lock_irqsave + smb_mb */
 		wake_up_process(bq->obj->kthread);
 	}