diff mbox series

[net-next] bpf: avoid hashtab deadlock with try_lock

Message ID 20221121100521.56601-2-xiangxia.m.yue@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series [net-next] bpf: avoid hashtab deadlock with try_lock | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 12 this patch: 12
netdev/cc_maintainers warning 1 maintainers not CCed: bpf@vger.kernel.org
netdev/build_clang success Errors and warnings before: 5 this patch: 5
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 12 this patch: 12
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 302 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline warning Was 2 now: 2
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 pending Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 fail Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-23 fail Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 fail Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 fail Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc

Commit Message

Tonghao Zhang Nov. 21, 2022, 10:05 a.m. UTC
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>

Commit 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked") tried
to fix the deadlock, but in some cases the deadlock still occurs:

* CPUn, in task context with key K1, takes the bucket lock.
* CPUn is interrupted in NMI context, with key K2.
* Both keys hash to the same bucket, but to different map_locked slots (sketched below).

	    | Task
	    |
	+---v----+
	|  CPUn  |
	+---^----+
	    |
	    | NMI
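
A minimal sketch of the mismatch (not part of the patch), assuming the
bucket is picked as hash & (n_buckets - 1) while the map_locked slot is
picked as hash & HASHTAB_MAP_LOCK_MASK, with made-up hash values:

#include <stdio.h>

#define HASHTAB_MAP_LOCK_COUNT 8
#define HASHTAB_MAP_LOCK_MASK  (HASHTAB_MAP_LOCK_COUNT - 1)

int main(void)
{
	unsigned int n_buckets = 4;	 /* n_buckets < HASHTAB_MAP_LOCK_COUNT */
	unsigned int hash[2] = { 1, 5 }; /* made-up hashes for K1 and K2 */
	int i;

	for (i = 0; i < 2; i++)
		printf("K%d: bucket %u, map_locked slot %u\n", i + 1,
		       hash[i] & (n_buckets - 1),
		       hash[i] & HASHTAB_MAP_LOCK_MASK);
	/* prints bucket 1 for both keys, but map_locked slots 1 and 5, so
	 * the per-CPU counter cannot detect the re-entry and both contexts
	 * contend on the same b->raw_lock.
	 */
	return 0;
}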

Either way, lockdep still warns:
[   36.092222] ================================
[   36.092230] WARNING: inconsistent lock state
[   36.092234] 6.1.0-rc5+ #81 Tainted: G            E
[   36.092236] --------------------------------
[   36.092237] inconsistent {INITIAL USE} -> {IN-NMI} usage.
[   36.092238] perf/1515 [HC1[1]:SC0[0]:HE0:SE1] takes:
[   36.092242] ffff888341acd1a0 (&htab->lockdep_key){....}-{2:2}, at: htab_lock_bucket+0x4d/0x58
[   36.092253] {INITIAL USE} state was registered at:
[   36.092255]   mark_usage+0x1d/0x11d
[   36.092262]   __lock_acquire+0x3c9/0x6ed
[   36.092266]   lock_acquire+0x23d/0x29a
[   36.092270]   _raw_spin_lock_irqsave+0x43/0x7f
[   36.092274]   htab_lock_bucket+0x4d/0x58
[   36.092276]   htab_map_delete_elem+0x82/0xfb
[   36.092278]   map_delete_elem+0x156/0x1ac
[   36.092282]   __sys_bpf+0x138/0xb71
[   36.092285]   __do_sys_bpf+0xd/0x15
[   36.092288]   do_syscall_64+0x6d/0x84
[   36.092291]   entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   36.092295] irq event stamp: 120346
[   36.092296] hardirqs last  enabled at (120345): [<ffffffff8180b97f>] _raw_spin_unlock_irq+0x24/0x39
[   36.092299] hardirqs last disabled at (120346): [<ffffffff81169e85>] generic_exec_single+0x40/0xb9
[   36.092303] softirqs last  enabled at (120268): [<ffffffff81c00347>] __do_softirq+0x347/0x387
[   36.092307] softirqs last disabled at (120133): [<ffffffff810ba4f0>] __irq_exit_rcu+0x67/0xc6
[   36.092311]
[   36.092311] other info that might help us debug this:
[   36.092312]  Possible unsafe locking scenario:
[   36.092312]
[   36.092313]        CPU0
[   36.092313]        ----
[   36.092314]   lock(&htab->lockdep_key);
[   36.092315]   <Interrupt>
[   36.092316]     lock(&htab->lockdep_key);
[   36.092318]
[   36.092318]  *** DEADLOCK ***
[   36.092318]
[   36.092318] 3 locks held by perf/1515:
[   36.092320]  #0: ffff8881b9805cc0 (&cpuctx_mutex){+.+.}-{4:4}, at: perf_event_ctx_lock_nested+0x8e/0xba
[   36.092327]  #1: ffff8881075ecc20 (&event->child_mutex){+.+.}-{4:4}, at: perf_event_for_each_child+0x35/0x76
[   36.092332]  #2: ffff8881b9805c20 (&cpuctx_lock){-.-.}-{2:2}, at: perf_ctx_lock+0x12/0x27
[   36.092339]
[   36.092339] stack backtrace:
[   36.092341] CPU: 0 PID: 1515 Comm: perf Tainted: G            E      6.1.0-rc5+ #81
[   36.092344] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[   36.092349] Call Trace:
[   36.092351]  <NMI>
[   36.092354]  dump_stack_lvl+0x57/0x81
[   36.092359]  lock_acquire+0x1f4/0x29a
[   36.092363]  ? handle_pmi_common+0x13f/0x1f0
[   36.092366]  ? htab_lock_bucket+0x4d/0x58
[   36.092371]  _raw_spin_lock_irqsave+0x43/0x7f
[   36.092374]  ? htab_lock_bucket+0x4d/0x58
[   36.092377]  htab_lock_bucket+0x4d/0x58
[   36.092379]  htab_map_update_elem+0x11e/0x220
[   36.092386]  bpf_prog_f3a535ca81a8128a_bpf_prog2+0x3e/0x42
[   36.092392]  trace_call_bpf+0x177/0x215
[   36.092398]  perf_trace_run_bpf_submit+0x52/0xaa
[   36.092403]  ? x86_pmu_stop+0x97/0x97
[   36.092407]  perf_trace_nmi_handler+0xb7/0xe0
[   36.092415]  nmi_handle+0x116/0x254
[   36.092418]  ? x86_pmu_stop+0x97/0x97
[   36.092423]  default_do_nmi+0x3d/0xf6
[   36.092428]  exc_nmi+0xa1/0x109
[   36.092432]  end_repeat_nmi+0x16/0x67
[   36.092436] RIP: 0010:wrmsrl+0xd/0x1b
[   36.092441] Code: 04 01 00 00 c6 84 07 48 01 00 00 01 5b e9 46 15 80 00 5b c3 cc cc cc cc c3 cc cc cc cc 48 89 f2 89 f9 89 f0 48 c1 ea 20 0f 30 <66> 90 c3 cc cc cc cc 31 d2 e9 2f 04 49 00 0f 1f 44 00 00 40 0f6
[   36.092443] RSP: 0018:ffffc900043dfc48 EFLAGS: 00000002
[   36.092445] RAX: 000000000000000f RBX: ffff8881b96153e0 RCX: 000000000000038f
[   36.092447] RDX: 0000000000000007 RSI: 000000070000000f RDI: 000000000000038f
[   36.092449] RBP: 000000070000000f R08: ffffffffffffffff R09: ffff8881053bdaa8
[   36.092451] R10: ffff8881b9805d40 R11: 0000000000000005 R12: ffff8881b9805c00
[   36.092452] R13: 0000000000000000 R14: 0000000000000000 R15: ffff8881075ec970
[   36.092460]  ? wrmsrl+0xd/0x1b
[   36.092465]  ? wrmsrl+0xd/0x1b
[   36.092469]  </NMI>
[   36.092469]  <TASK>
[   36.092470]  __intel_pmu_enable_all.constprop.0+0x7c/0xaf
[   36.092475]  event_function+0xb6/0xd3
[   36.092478]  ? cpu_to_node+0x1a/0x1a
[   36.092482]  ? cpu_to_node+0x1a/0x1a
[   36.092485]  remote_function+0x1e/0x4c
[   36.092489]  generic_exec_single+0x48/0xb9
[   36.092492]  ? __lock_acquire+0x666/0x6ed
[   36.092497]  smp_call_function_single+0xbf/0x106
[   36.092499]  ? cpu_to_node+0x1a/0x1a
[   36.092504]  ? kvm_sched_clock_read+0x5/0x11
[   36.092508]  ? __perf_event_task_sched_in+0x13d/0x13d
[   36.092513]  cpu_function_call+0x47/0x69
[   36.092516]  ? perf_event_update_time+0x52/0x52
[   36.092519]  event_function_call+0x89/0x117
[   36.092521]  ? __perf_event_task_sched_in+0x13d/0x13d
[   36.092526]  ? _perf_event_disable+0x4a/0x4a
[   36.092528]  perf_event_for_each_child+0x3d/0x76
[   36.092532]  ? _perf_event_disable+0x4a/0x4a
[   36.092533]  _perf_ioctl+0x564/0x590
[   36.092537]  ? __lock_release+0xd5/0x1b0
[   36.092543]  ? perf_event_ctx_lock_nested+0x8e/0xba
[   36.092547]  perf_ioctl+0x42/0x5f
[   36.092551]  vfs_ioctl+0x1e/0x2f
[   36.092554]  __do_sys_ioctl+0x66/0x89
[   36.092559]  do_syscall_64+0x6d/0x84
[   36.092563]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   36.092566] RIP: 0033:0x7fe7110f362b
[   36.092569] Code: 0f 1e fa 48 8b 05 5d b8 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 2d b8 2c 00 f7 d8 64 89 018
[   36.092570] RSP: 002b:00007ffebb8e4b08 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   36.092573] RAX: ffffffffffffffda RBX: 0000000000002400 RCX: 00007fe7110f362b
[   36.092575] RDX: 0000000000000000 RSI: 0000000000002400 RDI: 0000000000000013
[   36.092576] RBP: 00007ffebb8e4b40 R08: 0000000000000001 R09: 000055c1db4a5b40
[   36.092577] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
[   36.092579] R13: 000055c1db3b2a30 R14: 0000000000000000 R15: 0000000000000000
[   36.092586]  </TASK>

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Song Liu <song@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Stanislav Fomichev <sdf@google.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
 kernel/bpf/hashtab.c | 96 +++++++++++++++++---------------------------
 1 file changed, 36 insertions(+), 60 deletions(-)

Comments

Jakub Kicinski Nov. 21, 2022, 8:19 p.m. UTC | #1
On Mon, 21 Nov 2022 18:05:21 +0800 xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> 
> The commit 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked"),
> try to fix deadlock, but in some case, the deadlock occurs:
> 
> * CPUn in task context with K1, and taking lock.
> * CPUn interrupted by NMI context, with K2.
> * They are using the same bucket, but different map_locked.

You should really put bpf@ in the CC line for bpf patches.
Hou Tao Nov. 22, 2022, 1:15 a.m. UTC | #2
Hi,

On 11/21/2022 6:05 PM, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> The commit 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked"),
> try to fix deadlock, but in some case, the deadlock occurs:
>
> * CPUn in task context with K1, and taking lock.
> * CPUn interrupted by NMI context, with K2.
> * They are using the same bucket, but different map_locked.
It is possible when n_buckets is less than HASHTAB_MAP_LOCK_COUNT (e.g.,
n_buckets = 4). If hash & min(HASHTAB_MAP_LOCK_MASK, n_buckets - 1) is used as
the index of map_locked, I think the deadlock will be gone.
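
A minimal sketch of that suggestion (htab_lock_idx() is a hypothetical
helper name used only for illustration; min_t() is the kernel helper
from include/linux/minmax.h):

/* Clamping by n_buckets - 1 makes hashes that fall into the same
 * bucket also share the same map_locked slot, even when
 * n_buckets < HASHTAB_MAP_LOCK_COUNT.
 */
static inline u32 htab_lock_idx(const struct bpf_htab *htab, u32 hash)
{
	return hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
}
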
>
> 	    | Task
> 	    |
> 	+---v----+
> 	|  CPUn  |
> 	+---^----+
> 	    |
> 	    | NMI
>
> Anyway, the lockdep still warn:
> [   36.092222] ================================
> [   36.092230] WARNING: inconsistent lock state
> [   36.092234] 6.1.0-rc5+ #81 Tainted: G            E
> [   36.092236] --------------------------------
> [   36.092237] inconsistent {INITIAL USE} -> {IN-NMI} usage.
> [   36.092238] perf/1515 [HC1[1]:SC0[0]:HE0:SE1] takes:
> [   36.092242] ffff888341acd1a0 (&htab->lockdep_key){....}-{2:2}, at: htab_lock_bucket+0x4d/0x58
> [   36.092253] {INITIAL USE} state was registered at:
> [   36.092255]   mark_usage+0x1d/0x11d
> [   36.092262]   __lock_acquire+0x3c9/0x6ed
> [   36.092266]   lock_acquire+0x23d/0x29a
> [   36.092270]   _raw_spin_lock_irqsave+0x43/0x7f
> [   36.092274]   htab_lock_bucket+0x4d/0x58
> [   36.092276]   htab_map_delete_elem+0x82/0xfb
> [   36.092278]   map_delete_elem+0x156/0x1ac
> [   36.092282]   __sys_bpf+0x138/0xb71
> [   36.092285]   __do_sys_bpf+0xd/0x15
> [   36.092288]   do_syscall_64+0x6d/0x84
> [   36.092291]   entry_SYSCALL_64_after_hwframe+0x63/0xcd
> [   36.092295] irq event stamp: 120346
> [   36.092296] hardirqs last  enabled at (120345): [<ffffffff8180b97f>] _raw_spin_unlock_irq+0x24/0x39
> [   36.092299] hardirqs last disabled at (120346): [<ffffffff81169e85>] generic_exec_single+0x40/0xb9
> [   36.092303] softirqs last  enabled at (120268): [<ffffffff81c00347>] __do_softirq+0x347/0x387
> [   36.092307] softirqs last disabled at (120133): [<ffffffff810ba4f0>] __irq_exit_rcu+0x67/0xc6
> [   36.092311]
> [   36.092311] other info that might help us debug this:
> [   36.092312]  Possible unsafe locking scenario:
> [   36.092312]
> [   36.092313]        CPU0
> [   36.092313]        ----
> [   36.092314]   lock(&htab->lockdep_key);
> [   36.092315]   <Interrupt>
> [   36.092316]     lock(&htab->lockdep_key);
> [   36.092318]
> [   36.092318]  *** DEADLOCK ***
> [   36.092318]
> [   36.092318] 3 locks held by perf/1515:
> [   36.092320]  #0: ffff8881b9805cc0 (&cpuctx_mutex){+.+.}-{4:4}, at: perf_event_ctx_lock_nested+0x8e/0xba
> [   36.092327]  #1: ffff8881075ecc20 (&event->child_mutex){+.+.}-{4:4}, at: perf_event_for_each_child+0x35/0x76
> [   36.092332]  #2: ffff8881b9805c20 (&cpuctx_lock){-.-.}-{2:2}, at: perf_ctx_lock+0x12/0x27
> [   36.092339]
> [   36.092339] stack backtrace:
> [   36.092341] CPU: 0 PID: 1515 Comm: perf Tainted: G            E      6.1.0-rc5+ #81
> [   36.092344] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
> [   36.092349] Call Trace:
> [   36.092351]  <NMI>
> [   36.092354]  dump_stack_lvl+0x57/0x81
> [   36.092359]  lock_acquire+0x1f4/0x29a
> [   36.092363]  ? handle_pmi_common+0x13f/0x1f0
> [   36.092366]  ? htab_lock_bucket+0x4d/0x58
> [   36.092371]  _raw_spin_lock_irqsave+0x43/0x7f
> [   36.092374]  ? htab_lock_bucket+0x4d/0x58
> [   36.092377]  htab_lock_bucket+0x4d/0x58
> [   36.092379]  htab_map_update_elem+0x11e/0x220
> [   36.092386]  bpf_prog_f3a535ca81a8128a_bpf_prog2+0x3e/0x42
> [   36.092392]  trace_call_bpf+0x177/0x215
> [   36.092398]  perf_trace_run_bpf_submit+0x52/0xaa
> [   36.092403]  ? x86_pmu_stop+0x97/0x97
> [   36.092407]  perf_trace_nmi_handler+0xb7/0xe0
> [   36.092415]  nmi_handle+0x116/0x254
> [   36.092418]  ? x86_pmu_stop+0x97/0x97
> [   36.092423]  default_do_nmi+0x3d/0xf6
> [   36.092428]  exc_nmi+0xa1/0x109
> [   36.092432]  end_repeat_nmi+0x16/0x67
> [   36.092436] RIP: 0010:wrmsrl+0xd/0x1b
> [   36.092441] Code: 04 01 00 00 c6 84 07 48 01 00 00 01 5b e9 46 15 80 00 5b c3 cc cc cc cc c3 cc cc cc cc 48 89 f2 89 f9 89 f0 48 c1 ea 20 0f 30 <66> 90 c3 cc cc cc cc 31 d2 e9 2f 04 49 00 0f 1f 44 00 00 40 0f6
> [   36.092443] RSP: 0018:ffffc900043dfc48 EFLAGS: 00000002
> [   36.092445] RAX: 000000000000000f RBX: ffff8881b96153e0 RCX: 000000000000038f
> [   36.092447] RDX: 0000000000000007 RSI: 000000070000000f RDI: 000000000000038f
> [   36.092449] RBP: 000000070000000f R08: ffffffffffffffff R09: ffff8881053bdaa8
> [   36.092451] R10: ffff8881b9805d40 R11: 0000000000000005 R12: ffff8881b9805c00
> [   36.092452] R13: 0000000000000000 R14: 0000000000000000 R15: ffff8881075ec970
> [   36.092460]  ? wrmsrl+0xd/0x1b
> [   36.092465]  ? wrmsrl+0xd/0x1b
> [   36.092469]  </NMI>
> [   36.092469]  <TASK>
> [   36.092470]  __intel_pmu_enable_all.constprop.0+0x7c/0xaf
> [   36.092475]  event_function+0xb6/0xd3
> [   36.092478]  ? cpu_to_node+0x1a/0x1a
> [   36.092482]  ? cpu_to_node+0x1a/0x1a
> [   36.092485]  remote_function+0x1e/0x4c
> [   36.092489]  generic_exec_single+0x48/0xb9
> [   36.092492]  ? __lock_acquire+0x666/0x6ed
> [   36.092497]  smp_call_function_single+0xbf/0x106
> [   36.092499]  ? cpu_to_node+0x1a/0x1a
> [   36.092504]  ? kvm_sched_clock_read+0x5/0x11
> [   36.092508]  ? __perf_event_task_sched_in+0x13d/0x13d
> [   36.092513]  cpu_function_call+0x47/0x69
> [   36.092516]  ? perf_event_update_time+0x52/0x52
> [   36.092519]  event_function_call+0x89/0x117
> [   36.092521]  ? __perf_event_task_sched_in+0x13d/0x13d
> [   36.092526]  ? _perf_event_disable+0x4a/0x4a
> [   36.092528]  perf_event_for_each_child+0x3d/0x76
> [   36.092532]  ? _perf_event_disable+0x4a/0x4a
> [   36.092533]  _perf_ioctl+0x564/0x590
> [   36.092537]  ? __lock_release+0xd5/0x1b0
> [   36.092543]  ? perf_event_ctx_lock_nested+0x8e/0xba
> [   36.092547]  perf_ioctl+0x42/0x5f
> [   36.092551]  vfs_ioctl+0x1e/0x2f
> [   36.092554]  __do_sys_ioctl+0x66/0x89
> [   36.092559]  do_syscall_64+0x6d/0x84
> [   36.092563]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
> [   36.092566] RIP: 0033:0x7fe7110f362b
> [   36.092569] Code: 0f 1e fa 48 8b 05 5d b8 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 2d b8 2c 00 f7 d8 64 89 018
> [   36.092570] RSP: 002b:00007ffebb8e4b08 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> [   36.092573] RAX: ffffffffffffffda RBX: 0000000000002400 RCX: 00007fe7110f362b
> [   36.092575] RDX: 0000000000000000 RSI: 0000000000002400 RDI: 0000000000000013
> [   36.092576] RBP: 00007ffebb8e4b40 R08: 0000000000000001 R09: 000055c1db4a5b40
> [   36.092577] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
> [   36.092579] R13: 000055c1db3b2a30 R14: 0000000000000000 R15: 0000000000000000
> [   36.092586]  </TASK>
>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Andrii Nakryiko <andrii@kernel.org>
> Cc: Martin KaFai Lau <martin.lau@linux.dev>
> Cc: Song Liu <song@kernel.org>
> Cc: Yonghong Song <yhs@fb.com>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: KP Singh <kpsingh@kernel.org>
> Cc: Stanislav Fomichev <sdf@google.com>
> Cc: Hao Luo <haoluo@google.com>
> Cc: Jiri Olsa <jolsa@kernel.org>
> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> ---
>  kernel/bpf/hashtab.c | 96 +++++++++++++++++---------------------------
>  1 file changed, 36 insertions(+), 60 deletions(-)
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 22855d6ff6d3..429acd97c869 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -80,9 +80,6 @@ struct bucket {
>  	raw_spinlock_t raw_lock;
>  };
>  
> -#define HASHTAB_MAP_LOCK_COUNT 8
> -#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
> -
>  struct bpf_htab {
>  	struct bpf_map map;
>  	struct bpf_mem_alloc ma;
> @@ -104,7 +101,6 @@ struct bpf_htab {
>  	u32 elem_size;	/* size of each element in bytes */
>  	u32 hashrnd;
>  	struct lock_class_key lockdep_key;
> -	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
>  };
>  
>  /* each htab element is struct htab_elem + key + value */
> @@ -146,35 +142,26 @@ static void htab_init_buckets(struct bpf_htab *htab)
>  	}
>  }
>  
> -static inline int htab_lock_bucket(const struct bpf_htab *htab,
> -				   struct bucket *b, u32 hash,
> +static inline int htab_lock_bucket(struct bucket *b,
>  				   unsigned long *pflags)
>  {
>  	unsigned long flags;
>  
> -	hash = hash & HASHTAB_MAP_LOCK_MASK;
> -
> -	preempt_disable();
> -	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
> -		__this_cpu_dec(*(htab->map_locked[hash]));
> -		preempt_enable();
> -		return -EBUSY;
> +	if (in_nmi()) {
> +		if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
> +			return -EBUSY;
> +	} else {
> +		raw_spin_lock_irqsave(&b->raw_lock, flags);
>  	}
>  
> -	raw_spin_lock_irqsave(&b->raw_lock, flags);
>  	*pflags = flags;
> -
>  	return 0;
>  }
map_locked is also used to prevent re-entrance of htab_lock_bucket() on the
same CPU, so checking in_nmi() alone is not enough.
>  
> -static inline void htab_unlock_bucket(const struct bpf_htab *htab,
> -				      struct bucket *b, u32 hash,
> +static inline void htab_unlock_bucket(struct bucket *b,
>  				      unsigned long flags)
>  {
> -	hash = hash & HASHTAB_MAP_LOCK_MASK;
>  	raw_spin_unlock_irqrestore(&b->raw_lock, flags);
> -	__this_cpu_dec(*(htab->map_locked[hash]));
> -	preempt_enable();
>  }
>  
>  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
> @@ -467,7 +454,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
>  	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
>  	struct bpf_htab *htab;
> -	int err, i;
> +	int err;
>  
>  	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
>  	if (!htab)
> @@ -513,15 +500,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  	if (!htab->buckets)
>  		goto free_htab;
>  
> -	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
> -		htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
> -							   sizeof(int),
> -							   sizeof(int),
> -							   GFP_USER);
> -		if (!htab->map_locked[i])
> -			goto free_map_locked;
> -	}
> -
>  	if (htab->map.map_flags & BPF_F_ZERO_SEED)
>  		htab->hashrnd = 0;
>  	else
> @@ -549,13 +527,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  	if (htab->use_percpu_counter) {
>  		err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
>  		if (err)
> -			goto free_map_locked;
> +			goto free_buckets;
>  	}
>  
>  	if (prealloc) {
>  		err = prealloc_init(htab);
>  		if (err)
> -			goto free_map_locked;
> +			goto free_buckets;
>  
>  		if (!percpu && !lru) {
>  			/* lru itself can remove the least used element, so
> @@ -568,12 +546,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  	} else {
>  		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
>  		if (err)
> -			goto free_map_locked;
> +			goto free_buckets;
>  		if (percpu) {
>  			err = bpf_mem_alloc_init(&htab->pcpu_ma,
>  						 round_up(htab->map.value_size, 8), true);
>  			if (err)
> -				goto free_map_locked;
> +				goto free_buckets;
>  		}
>  	}
>  
> @@ -581,11 +559,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  
>  free_prealloc:
>  	prealloc_destroy(htab);
> -free_map_locked:
> +free_buckets:
>  	if (htab->use_percpu_counter)
>  		percpu_counter_destroy(&htab->pcount);
> -	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
> -		free_percpu(htab->map_locked[i]);
> +
>  	bpf_map_area_free(htab->buckets);
>  	bpf_mem_alloc_destroy(&htab->pcpu_ma);
>  	bpf_mem_alloc_destroy(&htab->ma);
> @@ -782,7 +759,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
>  	b = __select_bucket(htab, tgt_l->hash);
>  	head = &b->head;
>  
> -	ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
> +	ret = htab_lock_bucket(b, &flags);
>  	if (ret)
>  		return false;
>  
> @@ -793,7 +770,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
>  			break;
>  		}
>  
> -	htab_unlock_bucket(htab, b, tgt_l->hash, flags);
> +	htab_unlock_bucket(b, flags);
>  
>  	return l == tgt_l;
>  }
> @@ -1107,7 +1084,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>  		 */
>  	}
>  
> -	ret = htab_lock_bucket(htab, b, hash, &flags);
> +	ret = htab_lock_bucket(b, &flags);
>  	if (ret)
>  		return ret;
>  
> @@ -1152,7 +1129,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>  	}
>  	ret = 0;
>  err:
> -	htab_unlock_bucket(htab, b, hash, flags);
> +	htab_unlock_bucket(b, flags);
>  	return ret;
>  }
>  
> @@ -1198,7 +1175,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
>  	copy_map_value(&htab->map,
>  		       l_new->key + round_up(map->key_size, 8), value);
>  
> -	ret = htab_lock_bucket(htab, b, hash, &flags);
> +	ret = htab_lock_bucket(b, &flags);
>  	if (ret)
>  		return ret;
>  
> @@ -1219,7 +1196,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
>  	ret = 0;
>  
>  err:
> -	htab_unlock_bucket(htab, b, hash, flags);
> +	htab_unlock_bucket(b, flags);
>  
>  	if (ret)
>  		htab_lru_push_free(htab, l_new);
> @@ -1255,7 +1232,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
>  	b = __select_bucket(htab, hash);
>  	head = &b->head;
>  
> -	ret = htab_lock_bucket(htab, b, hash, &flags);
> +	ret = htab_lock_bucket(b, &flags);
>  	if (ret)
>  		return ret;
>  
> @@ -1280,7 +1257,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
>  	}
>  	ret = 0;
>  err:
> -	htab_unlock_bucket(htab, b, hash, flags);
> +	htab_unlock_bucket(b, flags);
>  	return ret;
>  }
>  
> @@ -1321,7 +1298,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
>  			return -ENOMEM;
>  	}
>  
> -	ret = htab_lock_bucket(htab, b, hash, &flags);
> +	ret = htab_lock_bucket(b, &flags);
>  	if (ret)
>  		return ret;
>  
> @@ -1345,7 +1322,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
>  	}
>  	ret = 0;
>  err:
> -	htab_unlock_bucket(htab, b, hash, flags);
> +	htab_unlock_bucket(b, flags);
>  	if (l_new)
>  		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
>  	return ret;
> @@ -1384,7 +1361,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
>  	b = __select_bucket(htab, hash);
>  	head = &b->head;
>  
> -	ret = htab_lock_bucket(htab, b, hash, &flags);
> +	ret = htab_lock_bucket(b, &flags);
>  	if (ret)
>  		return ret;
>  
> @@ -1397,7 +1374,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
>  		ret = -ENOENT;
>  	}
>  
> -	htab_unlock_bucket(htab, b, hash, flags);
> +	htab_unlock_bucket(b, flags);
>  	return ret;
>  }
>  
> @@ -1420,7 +1397,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
>  	b = __select_bucket(htab, hash);
>  	head = &b->head;
>  
> -	ret = htab_lock_bucket(htab, b, hash, &flags);
> +	ret = htab_lock_bucket(b, &flags);
>  	if (ret)
>  		return ret;
>  
> @@ -1431,7 +1408,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
>  	else
>  		ret = -ENOENT;
>  
> -	htab_unlock_bucket(htab, b, hash, flags);
> +	htab_unlock_bucket(b, flags);
>  	if (l)
>  		htab_lru_push_free(htab, l);
>  	return ret;
> @@ -1494,7 +1471,6 @@ static void htab_map_free_timers(struct bpf_map *map)
>  static void htab_map_free(struct bpf_map *map)
>  {
>  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> -	int i;
>  
>  	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
>  	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
> @@ -1517,10 +1493,10 @@ static void htab_map_free(struct bpf_map *map)
>  	bpf_map_area_free(htab->buckets);
>  	bpf_mem_alloc_destroy(&htab->pcpu_ma);
>  	bpf_mem_alloc_destroy(&htab->ma);
> +
>  	if (htab->use_percpu_counter)
>  		percpu_counter_destroy(&htab->pcount);
> -	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
> -		free_percpu(htab->map_locked[i]);
> +
>  	lockdep_unregister_key(&htab->lockdep_key);
>  	bpf_map_area_free(htab);
>  }
> @@ -1564,7 +1540,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
>  	b = __select_bucket(htab, hash);
>  	head = &b->head;
>  
> -	ret = htab_lock_bucket(htab, b, hash, &bflags);
> +	ret = htab_lock_bucket(b, &bflags);
>  	if (ret)
>  		return ret;
>  
> @@ -1602,7 +1578,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
>  			free_htab_elem(htab, l);
>  	}
>  
> -	htab_unlock_bucket(htab, b, hash, bflags);
> +	htab_unlock_bucket(b, bflags);
>  
>  	if (is_lru_map && l)
>  		htab_lru_push_free(htab, l);
> @@ -1720,7 +1696,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>  	head = &b->head;
>  	/* do not grab the lock unless need it (bucket_cnt > 0). */
>  	if (locked) {
> -		ret = htab_lock_bucket(htab, b, batch, &flags);
> +		ret = htab_lock_bucket(b, &flags);
>  		if (ret) {
>  			rcu_read_unlock();
>  			bpf_enable_instrumentation();
> @@ -1743,7 +1719,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>  		/* Note that since bucket_cnt > 0 here, it is implicit
>  		 * that the locked was grabbed, so release it.
>  		 */
> -		htab_unlock_bucket(htab, b, batch, flags);
> +		htab_unlock_bucket(b, flags);
>  		rcu_read_unlock();
>  		bpf_enable_instrumentation();
>  		goto after_loop;
> @@ -1754,7 +1730,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>  		/* Note that since bucket_cnt > 0 here, it is implicit
>  		 * that the locked was grabbed, so release it.
>  		 */
> -		htab_unlock_bucket(htab, b, batch, flags);
> +		htab_unlock_bucket(b, flags);
>  		rcu_read_unlock();
>  		bpf_enable_instrumentation();
>  		kvfree(keys);
> @@ -1815,7 +1791,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>  		dst_val += value_size;
>  	}
>  
> -	htab_unlock_bucket(htab, b, batch, flags);
> +	htab_unlock_bucket(b, flags);
>  	locked = false;
>  
>  	while (node_to_free) {
Tonghao Zhang Nov. 22, 2022, 3:12 a.m. UTC | #3
.

On Tue, Nov 22, 2022 at 9:16 AM Hou Tao <houtao1@huawei.com> wrote:
>
> Hi,
>
> On 11/21/2022 6:05 PM, xiangxia.m.yue@gmail.com wrote:
> > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >
> > The commit 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked"),
> > try to fix deadlock, but in some case, the deadlock occurs:
> >
> > * CPUn in task context with K1, and taking lock.
> > * CPUn interrupted by NMI context, with K2.
> > * They are using the same bucket, but different map_locked.
> It is possible when n_buckets is less than HASHTAB_MAP_LOCK_COUNT (e.g.,
> n_bucket=4). If using hash & min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) as the
> index of map_locked, I think the deadlock will be gone.
Yes, but to save memory, HASHTAB_MAP_LOCK_MASK should not be too
large (currently this value is 8 - 1).
If the user defines n_buckets as e.g. 8192, only part of the bucket index is
used to select the map_locked slot via hash & min(HASHTAB_MAP_LOCK_MASK, n_buckets - 1).
> >           | Task
> >           |
> >       +---v----+
> >       |  CPUn  |
> >       +---^----+
> >
> >           | NMI
> >
> > Anyway, the lockdep still warn:
> > [   36.092222] ================================
> > [   36.092230] WARNING: inconsistent lock state
> > [   36.092234] 6.1.0-rc5+ #81 Tainted: G            E
> > [   36.092236] --------------------------------
> > [   36.092237] inconsistent {INITIAL USE} -> {IN-NMI} usage.
> > [   36.092238] perf/1515 [HC1[1]:SC0[0]:HE0:SE1] takes:
> > [   36.092242] ffff888341acd1a0 (&htab->lockdep_key){....}-{2:2}, at: htab_lock_bucket+0x4d/0x58
> > [   36.092253] {INITIAL USE} state was registered at:
> > [   36.092255]   mark_usage+0x1d/0x11d
> > [   36.092262]   __lock_acquire+0x3c9/0x6ed
> > [   36.092266]   lock_acquire+0x23d/0x29a
> > [   36.092270]   _raw_spin_lock_irqsave+0x43/0x7f
> > [   36.092274]   htab_lock_bucket+0x4d/0x58
> > [   36.092276]   htab_map_delete_elem+0x82/0xfb
> > [   36.092278]   map_delete_elem+0x156/0x1ac
> > [   36.092282]   __sys_bpf+0x138/0xb71
> > [   36.092285]   __do_sys_bpf+0xd/0x15
> > [   36.092288]   do_syscall_64+0x6d/0x84
> > [   36.092291]   entry_SYSCALL_64_after_hwframe+0x63/0xcd
> > [   36.092295] irq event stamp: 120346
> > [   36.092296] hardirqs last  enabled at (120345): [<ffffffff8180b97f>] _raw_spin_unlock_irq+0x24/0x39
> > [   36.092299] hardirqs last disabled at (120346): [<ffffffff81169e85>] generic_exec_single+0x40/0xb9
> > [   36.092303] softirqs last  enabled at (120268): [<ffffffff81c00347>] __do_softirq+0x347/0x387
> > [   36.092307] softirqs last disabled at (120133): [<ffffffff810ba4f0>] __irq_exit_rcu+0x67/0xc6
> > [   36.092311]
> > [   36.092311] other info that might help us debug this:
> > [   36.092312]  Possible unsafe locking scenario:
> > [   36.092312]
> > [   36.092313]        CPU0
> > [   36.092313]        ----
> > [   36.092314]   lock(&htab->lockdep_key);
> > [   36.092315]   <Interrupt>
> > [   36.092316]     lock(&htab->lockdep_key);
> > [   36.092318]
> > [   36.092318]  *** DEADLOCK ***
> > [   36.092318]
> > [   36.092318] 3 locks held by perf/1515:
> > [   36.092320]  #0: ffff8881b9805cc0 (&cpuctx_mutex){+.+.}-{4:4}, at: perf_event_ctx_lock_nested+0x8e/0xba
> > [   36.092327]  #1: ffff8881075ecc20 (&event->child_mutex){+.+.}-{4:4}, at: perf_event_for_each_child+0x35/0x76
> > [   36.092332]  #2: ffff8881b9805c20 (&cpuctx_lock){-.-.}-{2:2}, at: perf_ctx_lock+0x12/0x27
> > [   36.092339]
> > [   36.092339] stack backtrace:
> > [   36.092341] CPU: 0 PID: 1515 Comm: perf Tainted: G            E      6.1.0-rc5+ #81
> > [   36.092344] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
> > [   36.092349] Call Trace:
> > [   36.092351]  <NMI>
> > [   36.092354]  dump_stack_lvl+0x57/0x81
> > [   36.092359]  lock_acquire+0x1f4/0x29a
> > [   36.092363]  ? handle_pmi_common+0x13f/0x1f0
> > [   36.092366]  ? htab_lock_bucket+0x4d/0x58
> > [   36.092371]  _raw_spin_lock_irqsave+0x43/0x7f
> > [   36.092374]  ? htab_lock_bucket+0x4d/0x58
> > [   36.092377]  htab_lock_bucket+0x4d/0x58
> > [   36.092379]  htab_map_update_elem+0x11e/0x220
> > [   36.092386]  bpf_prog_f3a535ca81a8128a_bpf_prog2+0x3e/0x42
> > [   36.092392]  trace_call_bpf+0x177/0x215
> > [   36.092398]  perf_trace_run_bpf_submit+0x52/0xaa
> > [   36.092403]  ? x86_pmu_stop+0x97/0x97
> > [   36.092407]  perf_trace_nmi_handler+0xb7/0xe0
> > [   36.092415]  nmi_handle+0x116/0x254
> > [   36.092418]  ? x86_pmu_stop+0x97/0x97
> > [   36.092423]  default_do_nmi+0x3d/0xf6
> > [   36.092428]  exc_nmi+0xa1/0x109
> > [   36.092432]  end_repeat_nmi+0x16/0x67
> > [   36.092436] RIP: 0010:wrmsrl+0xd/0x1b
> > [   36.092441] Code: 04 01 00 00 c6 84 07 48 01 00 00 01 5b e9 46 15 80 00 5b c3 cc cc cc cc c3 cc cc cc cc 48 89 f2 89 f9 89 f0 48 c1 ea 20 0f 30 <66> 90 c3 cc cc cc cc 31 d2 e9 2f 04 49 00 0f 1f 44 00 00 40 0f6
> > [   36.092443] RSP: 0018:ffffc900043dfc48 EFLAGS: 00000002
> > [   36.092445] RAX: 000000000000000f RBX: ffff8881b96153e0 RCX: 000000000000038f
> > [   36.092447] RDX: 0000000000000007 RSI: 000000070000000f RDI: 000000000000038f
> > [   36.092449] RBP: 000000070000000f R08: ffffffffffffffff R09: ffff8881053bdaa8
> > [   36.092451] R10: ffff8881b9805d40 R11: 0000000000000005 R12: ffff8881b9805c00
> > [   36.092452] R13: 0000000000000000 R14: 0000000000000000 R15: ffff8881075ec970
> > [   36.092460]  ? wrmsrl+0xd/0x1b
> > [   36.092465]  ? wrmsrl+0xd/0x1b
> > [   36.092469]  </NMI>
> > [   36.092469]  <TASK>
> > [   36.092470]  __intel_pmu_enable_all.constprop.0+0x7c/0xaf
> > [   36.092475]  event_function+0xb6/0xd3
> > [   36.092478]  ? cpu_to_node+0x1a/0x1a
> > [   36.092482]  ? cpu_to_node+0x1a/0x1a
> > [   36.092485]  remote_function+0x1e/0x4c
> > [   36.092489]  generic_exec_single+0x48/0xb9
> > [   36.092492]  ? __lock_acquire+0x666/0x6ed
> > [   36.092497]  smp_call_function_single+0xbf/0x106
> > [   36.092499]  ? cpu_to_node+0x1a/0x1a
> > [   36.092504]  ? kvm_sched_clock_read+0x5/0x11
> > [   36.092508]  ? __perf_event_task_sched_in+0x13d/0x13d
> > [   36.092513]  cpu_function_call+0x47/0x69
> > [   36.092516]  ? perf_event_update_time+0x52/0x52
> > [   36.092519]  event_function_call+0x89/0x117
> > [   36.092521]  ? __perf_event_task_sched_in+0x13d/0x13d
> > [   36.092526]  ? _perf_event_disable+0x4a/0x4a
> > [   36.092528]  perf_event_for_each_child+0x3d/0x76
> > [   36.092532]  ? _perf_event_disable+0x4a/0x4a
> > [   36.092533]  _perf_ioctl+0x564/0x590
> > [   36.092537]  ? __lock_release+0xd5/0x1b0
> > [   36.092543]  ? perf_event_ctx_lock_nested+0x8e/0xba
> > [   36.092547]  perf_ioctl+0x42/0x5f
> > [   36.092551]  vfs_ioctl+0x1e/0x2f
> > [   36.092554]  __do_sys_ioctl+0x66/0x89
> > [   36.092559]  do_syscall_64+0x6d/0x84
> > [   36.092563]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
> > [   36.092566] RIP: 0033:0x7fe7110f362b
> > [   36.092569] Code: 0f 1e fa 48 8b 05 5d b8 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 2d b8 2c 00 f7 d8 64 89 018
> > [   36.092570] RSP: 002b:00007ffebb8e4b08 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> > [   36.092573] RAX: ffffffffffffffda RBX: 0000000000002400 RCX: 00007fe7110f362b
> > [   36.092575] RDX: 0000000000000000 RSI: 0000000000002400 RDI: 0000000000000013
> > [   36.092576] RBP: 00007ffebb8e4b40 R08: 0000000000000001 R09: 000055c1db4a5b40
> > [   36.092577] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
> > [   36.092579] R13: 000055c1db3b2a30 R14: 0000000000000000 R15: 0000000000000000
> > [   36.092586]  </TASK>
> >
> > Cc: Alexei Starovoitov <ast@kernel.org>
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Andrii Nakryiko <andrii@kernel.org>
> > Cc: Martin KaFai Lau <martin.lau@linux.dev>
> > Cc: Song Liu <song@kernel.org>
> > Cc: Yonghong Song <yhs@fb.com>
> > Cc: John Fastabend <john.fastabend@gmail.com>
> > Cc: KP Singh <kpsingh@kernel.org>
> > Cc: Stanislav Fomichev <sdf@google.com>
> > Cc: Hao Luo <haoluo@google.com>
> > Cc: Jiri Olsa <jolsa@kernel.org>
> > Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> > ---
> >  kernel/bpf/hashtab.c | 96 +++++++++++++++++---------------------------
> >  1 file changed, 36 insertions(+), 60 deletions(-)
> >
> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > index 22855d6ff6d3..429acd97c869 100644
> > --- a/kernel/bpf/hashtab.c
> > +++ b/kernel/bpf/hashtab.c
> > @@ -80,9 +80,6 @@ struct bucket {
> >       raw_spinlock_t raw_lock;
> >  };
> >
> > -#define HASHTAB_MAP_LOCK_COUNT 8
> > -#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
> > -
> >  struct bpf_htab {
> >       struct bpf_map map;
> >       struct bpf_mem_alloc ma;
> > @@ -104,7 +101,6 @@ struct bpf_htab {
> >       u32 elem_size;  /* size of each element in bytes */
> >       u32 hashrnd;
> >       struct lock_class_key lockdep_key;
> > -     int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
> >  };
> >
> >  /* each htab element is struct htab_elem + key + value */
> > @@ -146,35 +142,26 @@ static void htab_init_buckets(struct bpf_htab *htab)
> >       }
> >  }
> >
> > -static inline int htab_lock_bucket(const struct bpf_htab *htab,
> > -                                struct bucket *b, u32 hash,
> > +static inline int htab_lock_bucket(struct bucket *b,
> >                                  unsigned long *pflags)
> >  {
> >       unsigned long flags;
> >
> > -     hash = hash & HASHTAB_MAP_LOCK_MASK;
> > -
> > -     preempt_disable();
> > -     if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
> > -             __this_cpu_dec(*(htab->map_locked[hash]));
> > -             preempt_enable();
> > -             return -EBUSY;
> > +     if (in_nmi()) {
> > +             if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
> > +                     return -EBUSY;
> > +     } else {
> > +             raw_spin_lock_irqsave(&b->raw_lock, flags);
> >       }
> >
> > -     raw_spin_lock_irqsave(&b->raw_lock, flags);
> >       *pflags = flags;
> > -
> >       return 0;
> >  }
> map_locked is also used to prevent the re-entrance of htab_lock_bucket() on the
> same CPU, so only check in_nmi() is not enough.
NMI, IRQ, and preemption may interrupt the task context.
In htab_lock_bucket(), raw_spin_lock_irqsave() disables preemption and
IRQs, so only an NMI may interrupt this code, right?

> > -static inline void htab_unlock_bucket(const struct bpf_htab *htab,
> > -                                   struct bucket *b, u32 hash,
> > +static inline void htab_unlock_bucket(struct bucket *b,
> >                                     unsigned long flags)
> >  {
> > -     hash = hash & HASHTAB_MAP_LOCK_MASK;
> >       raw_spin_unlock_irqrestore(&b->raw_lock, flags);
> > -     __this_cpu_dec(*(htab->map_locked[hash]));
> > -     preempt_enable();
> >  }
> >
> >  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
> > @@ -467,7 +454,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >       bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
> >       bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
> >       struct bpf_htab *htab;
> > -     int err, i;
> > +     int err;
> >
> >       htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
> >       if (!htab)
> > @@ -513,15 +500,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >       if (!htab->buckets)
> >               goto free_htab;
> >
> > -     for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
> > -             htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
> > -                                                        sizeof(int),
> > -                                                        sizeof(int),
> > -                                                        GFP_USER);
> > -             if (!htab->map_locked[i])
> > -                     goto free_map_locked;
> > -     }
> > -
> >       if (htab->map.map_flags & BPF_F_ZERO_SEED)
> >               htab->hashrnd = 0;
> >       else
> > @@ -549,13 +527,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >       if (htab->use_percpu_counter) {
> >               err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
> >               if (err)
> > -                     goto free_map_locked;
> > +                     goto free_buckets;
> >       }
> >
> >       if (prealloc) {
> >               err = prealloc_init(htab);
> >               if (err)
> > -                     goto free_map_locked;
> > +                     goto free_buckets;
> >
> >               if (!percpu && !lru) {
> >                       /* lru itself can remove the least used element, so
> > @@ -568,12 +546,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >       } else {
> >               err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
> >               if (err)
> > -                     goto free_map_locked;
> > +                     goto free_buckets;
> >               if (percpu) {
> >                       err = bpf_mem_alloc_init(&htab->pcpu_ma,
> >                                                round_up(htab->map.value_size, 8), true);
> >                       if (err)
> > -                             goto free_map_locked;
> > +                             goto free_buckets;
> >               }
> >       }
> >
> > @@ -581,11 +559,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >
> >  free_prealloc:
> >       prealloc_destroy(htab);
> > -free_map_locked:
> > +free_buckets:
> >       if (htab->use_percpu_counter)
> >               percpu_counter_destroy(&htab->pcount);
> > -     for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
> > -             free_percpu(htab->map_locked[i]);
> > +
> >       bpf_map_area_free(htab->buckets);
> >       bpf_mem_alloc_destroy(&htab->pcpu_ma);
> >       bpf_mem_alloc_destroy(&htab->ma);
> > @@ -782,7 +759,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
> >       b = __select_bucket(htab, tgt_l->hash);
> >       head = &b->head;
> >
> > -     ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
> > +     ret = htab_lock_bucket(b, &flags);
> >       if (ret)
> >               return false;
> >
> > @@ -793,7 +770,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
> >                       break;
> >               }
> >
> > -     htab_unlock_bucket(htab, b, tgt_l->hash, flags);
> > +     htab_unlock_bucket(b, flags);
> >
> >       return l == tgt_l;
> >  }
> > @@ -1107,7 +1084,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> >                */
> >       }
> >
> > -     ret = htab_lock_bucket(htab, b, hash, &flags);
> > +     ret = htab_lock_bucket(b, &flags);
> >       if (ret)
> >               return ret;
> >
> > @@ -1152,7 +1129,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> >       }
> >       ret = 0;
> >  err:
> > -     htab_unlock_bucket(htab, b, hash, flags);
> > +     htab_unlock_bucket(b, flags);
> >       return ret;
> >  }
> >
> > @@ -1198,7 +1175,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
> >       copy_map_value(&htab->map,
> >                      l_new->key + round_up(map->key_size, 8), value);
> >
> > -     ret = htab_lock_bucket(htab, b, hash, &flags);
> > +     ret = htab_lock_bucket(b, &flags);
> >       if (ret)
> >               return ret;
> >
> > @@ -1219,7 +1196,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
> >       ret = 0;
> >
> >  err:
> > -     htab_unlock_bucket(htab, b, hash, flags);
> > +     htab_unlock_bucket(b, flags);
> >
> >       if (ret)
> >               htab_lru_push_free(htab, l_new);
> > @@ -1255,7 +1232,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
> >       b = __select_bucket(htab, hash);
> >       head = &b->head;
> >
> > -     ret = htab_lock_bucket(htab, b, hash, &flags);
> > +     ret = htab_lock_bucket(b, &flags);
> >       if (ret)
> >               return ret;
> >
> > @@ -1280,7 +1257,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
> >       }
> >       ret = 0;
> >  err:
> > -     htab_unlock_bucket(htab, b, hash, flags);
> > +     htab_unlock_bucket(b, flags);
> >       return ret;
> >  }
> >
> > @@ -1321,7 +1298,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
> >                       return -ENOMEM;
> >       }
> >
> > -     ret = htab_lock_bucket(htab, b, hash, &flags);
> > +     ret = htab_lock_bucket(b, &flags);
> >       if (ret)
> >               return ret;
> >
> > @@ -1345,7 +1322,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
> >       }
> >       ret = 0;
> >  err:
> > -     htab_unlock_bucket(htab, b, hash, flags);
> > +     htab_unlock_bucket(b, flags);
> >       if (l_new)
> >               bpf_lru_push_free(&htab->lru, &l_new->lru_node);
> >       return ret;
> > @@ -1384,7 +1361,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
> >       b = __select_bucket(htab, hash);
> >       head = &b->head;
> >
> > -     ret = htab_lock_bucket(htab, b, hash, &flags);
> > +     ret = htab_lock_bucket(b, &flags);
> >       if (ret)
> >               return ret;
> >
> > @@ -1397,7 +1374,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
> >               ret = -ENOENT;
> >       }
> >
> > -     htab_unlock_bucket(htab, b, hash, flags);
> > +     htab_unlock_bucket(b, flags);
> >       return ret;
> >  }
> >
> > @@ -1420,7 +1397,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
> >       b = __select_bucket(htab, hash);
> >       head = &b->head;
> >
> > -     ret = htab_lock_bucket(htab, b, hash, &flags);
> > +     ret = htab_lock_bucket(b, &flags);
> >       if (ret)
> >               return ret;
> >
> > @@ -1431,7 +1408,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
> >       else
> >               ret = -ENOENT;
> >
> > -     htab_unlock_bucket(htab, b, hash, flags);
> > +     htab_unlock_bucket(b, flags);
> >       if (l)
> >               htab_lru_push_free(htab, l);
> >       return ret;
> > @@ -1494,7 +1471,6 @@ static void htab_map_free_timers(struct bpf_map *map)
> >  static void htab_map_free(struct bpf_map *map)
> >  {
> >       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> > -     int i;
> >
> >       /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
> >        * bpf_free_used_maps() is called after bpf prog is no longer executing.
> > @@ -1517,10 +1493,10 @@ static void htab_map_free(struct bpf_map *map)
> >       bpf_map_area_free(htab->buckets);
> >       bpf_mem_alloc_destroy(&htab->pcpu_ma);
> >       bpf_mem_alloc_destroy(&htab->ma);
> > +
> >       if (htab->use_percpu_counter)
> >               percpu_counter_destroy(&htab->pcount);
> > -     for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
> > -             free_percpu(htab->map_locked[i]);
> > +
> >       lockdep_unregister_key(&htab->lockdep_key);
> >       bpf_map_area_free(htab);
> >  }
> > @@ -1564,7 +1540,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
> >       b = __select_bucket(htab, hash);
> >       head = &b->head;
> >
> > -     ret = htab_lock_bucket(htab, b, hash, &bflags);
> > +     ret = htab_lock_bucket(b, &bflags);
> >       if (ret)
> >               return ret;
> >
> > @@ -1602,7 +1578,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
> >                       free_htab_elem(htab, l);
> >       }
> >
> > -     htab_unlock_bucket(htab, b, hash, bflags);
> > +     htab_unlock_bucket(b, bflags);
> >
> >       if (is_lru_map && l)
> >               htab_lru_push_free(htab, l);
> > @@ -1720,7 +1696,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> >       head = &b->head;
> >       /* do not grab the lock unless need it (bucket_cnt > 0). */
> >       if (locked) {
> > -             ret = htab_lock_bucket(htab, b, batch, &flags);
> > +             ret = htab_lock_bucket(b, &flags);
> >               if (ret) {
> >                       rcu_read_unlock();
> >                       bpf_enable_instrumentation();
> > @@ -1743,7 +1719,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> >               /* Note that since bucket_cnt > 0 here, it is implicit
> >                * that the locked was grabbed, so release it.
> >                */
> > -             htab_unlock_bucket(htab, b, batch, flags);
> > +             htab_unlock_bucket(b, flags);
> >               rcu_read_unlock();
> >               bpf_enable_instrumentation();
> >               goto after_loop;
> > @@ -1754,7 +1730,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> >               /* Note that since bucket_cnt > 0 here, it is implicit
> >                * that the locked was grabbed, so release it.
> >                */
> > -             htab_unlock_bucket(htab, b, batch, flags);
> > +             htab_unlock_bucket(b, flags);
> >               rcu_read_unlock();
> >               bpf_enable_instrumentation();
> >               kvfree(keys);
> > @@ -1815,7 +1791,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> >               dst_val += value_size;
> >       }
> >
> > -     htab_unlock_bucket(htab, b, batch, flags);
> > +     htab_unlock_bucket(b, flags);
> >       locked = false;
> >
> >       while (node_to_free) {
>
Hou Tao Nov. 22, 2022, 4:01 a.m. UTC | #4
Hi,

On 11/22/2022 11:12 AM, Tonghao Zhang wrote:
> .
>
> On Tue, Nov 22, 2022 at 9:16 AM Hou Tao <houtao1@huawei.com> wrote:
>> Hi,
>>
>> On 11/21/2022 6:05 PM, xiangxia.m.yue@gmail.com wrote:
>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>>>
>>> The commit 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked"),
>>> try to fix deadlock, but in some case, the deadlock occurs:
>>>
>>> * CPUn in task context with K1, and taking lock.
>>> * CPUn interrupted by NMI context, with K2.
>>> * They are using the same bucket, but different map_locked.
>> It is possible when n_buckets is less than HASHTAB_MAP_LOCK_COUNT (e.g.,
>> n_bucket=4). If using hash & min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) as the
>> index of map_locked, I think the deadlock will be gone.
> Yes, but for saving memory, HASHTAB_MAP_LOCK_MASK should not be too
> large(now this value is 8-1).
> if user define n_bucket ,e.g 8192, the part of bucket only are
> selected via hash & min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1).
SNIP
>
>>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>>> index 22855d6ff6d3..429acd97c869 100644
>>> --- a/kernel/bpf/hashtab.c
>>> +++ b/kernel/bpf/hashtab.c
>>> @@ -80,9 +80,6 @@ struct bucket {
>>>       raw_spinlock_t raw_lock;
>>>  };
>>>
>>> -#define HASHTAB_MAP_LOCK_COUNT 8
>>> -#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
>>> -
>>>  struct bpf_htab {
>>>       struct bpf_map map;
>>>       struct bpf_mem_alloc ma;
>>> @@ -104,7 +101,6 @@ struct bpf_htab {
>>>       u32 elem_size;  /* size of each element in bytes */
>>>       u32 hashrnd;
>>>       struct lock_class_key lockdep_key;
>>> -     int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
>>>  };
>>>
>>>  /* each htab element is struct htab_elem + key + value */
>>> @@ -146,35 +142,26 @@ static void htab_init_buckets(struct bpf_htab *htab)
>>>       }
>>>  }
>>>
>>> -static inline int htab_lock_bucket(const struct bpf_htab *htab,
>>> -                                struct bucket *b, u32 hash,
>>> +static inline int htab_lock_bucket(struct bucket *b,
>>>                                  unsigned long *pflags)
>>>  {
>>>       unsigned long flags;
>>>
>>> -     hash = hash & HASHTAB_MAP_LOCK_MASK;
>>> -
>>> -     preempt_disable();
>>> -     if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
>>> -             __this_cpu_dec(*(htab->map_locked[hash]));
>>> -             preempt_enable();
>>> -             return -EBUSY;
>>> +     if (in_nmi()) {
>>> +             if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
>>> +                     return -EBUSY;
>>> +     } else {
>>> +             raw_spin_lock_irqsave(&b->raw_lock, flags);
>>>       }
>>>
>>> -     raw_spin_lock_irqsave(&b->raw_lock, flags);
>>>       *pflags = flags;
>>> -
>>>       return 0;
>>>  }
>> map_locked is also used to prevent the re-entrance of htab_lock_bucket() on the
>> same CPU, so only check in_nmi() is not enough.
> NMI, IRQ, and preempt may interrupt the task context.
> In htab_lock_bucket, raw_spin_lock_irqsave disable the preempt and
> irq. so only NMI may interrupt the codes, right ?
The re-entrance here means the nesting of bpf programs, as shown below:

bpf_prog A
update map X
    htab_lock_bucket
        raw_spin_lock_irqsave()
    lookup_elem_raw()
        // bpf prog B is attached on lookup_elem_raw()
        bpf prog B
            update map X again and update the element
                htab_lock_bucket()
                    // dead-lock
                    raw_spin_lock_irqsave()
.
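
For reference, a sketch of the per-CPU guard that this patch removes
(reconstructed from the deleted hunk above), with the bail-out path
taken by the nested bpf prog B marked:

static inline int htab_lock_bucket(const struct bpf_htab *htab,
				   struct bucket *b, u32 hash,
				   unsigned long *pflags)
{
	unsigned long flags;

	hash = hash & HASHTAB_MAP_LOCK_MASK;

	preempt_disable();
	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
		/* the nested call from bpf prog B on the same CPU sees the
		 * counter already at 1 and returns -EBUSY instead of
		 * spinning on a lock this CPU already holds
		 */
		__this_cpu_dec(*(htab->map_locked[hash]));
		preempt_enable();
		return -EBUSY;
	}

	raw_spin_lock_irqsave(&b->raw_lock, flags);
	*pflags = flags;
	return 0;
}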

>>> -static inline void htab_unlock_bucket(const struct bpf_htab *htab,
>>> -                                   struct bucket *b, u32 hash,
>>> +static inline void htab_unlock_bucket(struct bucket *b,
>>>                                     unsigned long flags)
>>>  {
>>> -     hash = hash & HASHTAB_MAP_LOCK_MASK;
>>>       raw_spin_unlock_irqrestore(&b->raw_lock, flags);
>>> -     __this_cpu_dec(*(htab->map_locked[hash]));
>>> -     preempt_enable();
>>>  }
>>>
>>>  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
>>> @@ -467,7 +454,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>>>       bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
>>>       bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
>>>       struct bpf_htab *htab;
>>> -     int err, i;
>>> +     int err;
>>>
>>>       htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
>>>       if (!htab)
>>> @@ -513,15 +500,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>>>       if (!htab->buckets)
>>>               goto free_htab;
>>>
>>> -     for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
>>> -             htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
>>> -                                                        sizeof(int),
>>> -                                                        sizeof(int),
>>> -                                                        GFP_USER);
>>> -             if (!htab->map_locked[i])
>>> -                     goto free_map_locked;
>>> -     }
>>> -
>>>       if (htab->map.map_flags & BPF_F_ZERO_SEED)
>>>               htab->hashrnd = 0;
>>>       else
>>> @@ -549,13 +527,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>>>       if (htab->use_percpu_counter) {
>>>               err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
>>>               if (err)
>>> -                     goto free_map_locked;
>>> +                     goto free_buckets;
>>>       }
>>>
>>>       if (prealloc) {
>>>               err = prealloc_init(htab);
>>>               if (err)
>>> -                     goto free_map_locked;
>>> +                     goto free_buckets;
>>>
>>>               if (!percpu && !lru) {
>>>                       /* lru itself can remove the least used element, so
>>> @@ -568,12 +546,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>>>       } else {
>>>               err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
>>>               if (err)
>>> -                     goto free_map_locked;
>>> +                     goto free_buckets;
>>>               if (percpu) {
>>>                       err = bpf_mem_alloc_init(&htab->pcpu_ma,
>>>                                                round_up(htab->map.value_size, 8), true);
>>>                       if (err)
>>> -                             goto free_map_locked;
>>> +                             goto free_buckets;
>>>               }
>>>       }
>>>
>>> @@ -581,11 +559,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>>>
>>>  free_prealloc:
>>>       prealloc_destroy(htab);
>>> -free_map_locked:
>>> +free_buckets:
>>>       if (htab->use_percpu_counter)
>>>               percpu_counter_destroy(&htab->pcount);
>>> -     for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
>>> -             free_percpu(htab->map_locked[i]);
>>> +
>>>       bpf_map_area_free(htab->buckets);
>>>       bpf_mem_alloc_destroy(&htab->pcpu_ma);
>>>       bpf_mem_alloc_destroy(&htab->ma);
>>> @@ -782,7 +759,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
>>>       b = __select_bucket(htab, tgt_l->hash);
>>>       head = &b->head;
>>>
>>> -     ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
>>> +     ret = htab_lock_bucket(b, &flags);
>>>       if (ret)
>>>               return false;
>>>
>>> @@ -793,7 +770,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
>>>                       break;
>>>               }
>>>
>>> -     htab_unlock_bucket(htab, b, tgt_l->hash, flags);
>>> +     htab_unlock_bucket(b, flags);
>>>
>>>       return l == tgt_l;
>>>  }
>>> @@ -1107,7 +1084,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>>>                */
>>>       }
>>>
>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
>>> +     ret = htab_lock_bucket(b, &flags);
>>>       if (ret)
>>>               return ret;
>>>
>>> @@ -1152,7 +1129,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>>>       }
>>>       ret = 0;
>>>  err:
>>> -     htab_unlock_bucket(htab, b, hash, flags);
>>> +     htab_unlock_bucket(b, flags);
>>>       return ret;
>>>  }
>>>
>>> @@ -1198,7 +1175,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
>>>       copy_map_value(&htab->map,
>>>                      l_new->key + round_up(map->key_size, 8), value);
>>>
>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
>>> +     ret = htab_lock_bucket(b, &flags);
>>>       if (ret)
>>>               return ret;
>>>
>>> @@ -1219,7 +1196,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
>>>       ret = 0;
>>>
>>>  err:
>>> -     htab_unlock_bucket(htab, b, hash, flags);
>>> +     htab_unlock_bucket(b, flags);
>>>
>>>       if (ret)
>>>               htab_lru_push_free(htab, l_new);
>>> @@ -1255,7 +1232,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
>>>       b = __select_bucket(htab, hash);
>>>       head = &b->head;
>>>
>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
>>> +     ret = htab_lock_bucket(b, &flags);
>>>       if (ret)
>>>               return ret;
>>>
>>> @@ -1280,7 +1257,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
>>>       }
>>>       ret = 0;
>>>  err:
>>> -     htab_unlock_bucket(htab, b, hash, flags);
>>> +     htab_unlock_bucket(b, flags);
>>>       return ret;
>>>  }
>>>
>>> @@ -1321,7 +1298,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
>>>                       return -ENOMEM;
>>>       }
>>>
>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
>>> +     ret = htab_lock_bucket(b, &flags);
>>>       if (ret)
>>>               return ret;
>>>
>>> @@ -1345,7 +1322,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
>>>       }
>>>       ret = 0;
>>>  err:
>>> -     htab_unlock_bucket(htab, b, hash, flags);
>>> +     htab_unlock_bucket(b, flags);
>>>       if (l_new)
>>>               bpf_lru_push_free(&htab->lru, &l_new->lru_node);
>>>       return ret;
>>> @@ -1384,7 +1361,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
>>>       b = __select_bucket(htab, hash);
>>>       head = &b->head;
>>>
>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
>>> +     ret = htab_lock_bucket(b, &flags);
>>>       if (ret)
>>>               return ret;
>>>
>>> @@ -1397,7 +1374,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
>>>               ret = -ENOENT;
>>>       }
>>>
>>> -     htab_unlock_bucket(htab, b, hash, flags);
>>> +     htab_unlock_bucket(b, flags);
>>>       return ret;
>>>  }
>>>
>>> @@ -1420,7 +1397,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
>>>       b = __select_bucket(htab, hash);
>>>       head = &b->head;
>>>
>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
>>> +     ret = htab_lock_bucket(b, &flags);
>>>       if (ret)
>>>               return ret;
>>>
>>> @@ -1431,7 +1408,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
>>>       else
>>>               ret = -ENOENT;
>>>
>>> -     htab_unlock_bucket(htab, b, hash, flags);
>>> +     htab_unlock_bucket(b, flags);
>>>       if (l)
>>>               htab_lru_push_free(htab, l);
>>>       return ret;
>>> @@ -1494,7 +1471,6 @@ static void htab_map_free_timers(struct bpf_map *map)
>>>  static void htab_map_free(struct bpf_map *map)
>>>  {
>>>       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
>>> -     int i;
>>>
>>>       /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
>>>        * bpf_free_used_maps() is called after bpf prog is no longer executing.
>>> @@ -1517,10 +1493,10 @@ static void htab_map_free(struct bpf_map *map)
>>>       bpf_map_area_free(htab->buckets);
>>>       bpf_mem_alloc_destroy(&htab->pcpu_ma);
>>>       bpf_mem_alloc_destroy(&htab->ma);
>>> +
>>>       if (htab->use_percpu_counter)
>>>               percpu_counter_destroy(&htab->pcount);
>>> -     for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
>>> -             free_percpu(htab->map_locked[i]);
>>> +
>>>       lockdep_unregister_key(&htab->lockdep_key);
>>>       bpf_map_area_free(htab);
>>>  }
>>> @@ -1564,7 +1540,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
>>>       b = __select_bucket(htab, hash);
>>>       head = &b->head;
>>>
>>> -     ret = htab_lock_bucket(htab, b, hash, &bflags);
>>> +     ret = htab_lock_bucket(b, &bflags);
>>>       if (ret)
>>>               return ret;
>>>
>>> @@ -1602,7 +1578,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
>>>                       free_htab_elem(htab, l);
>>>       }
>>>
>>> -     htab_unlock_bucket(htab, b, hash, bflags);
>>> +     htab_unlock_bucket(b, bflags);
>>>
>>>       if (is_lru_map && l)
>>>               htab_lru_push_free(htab, l);
>>> @@ -1720,7 +1696,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>>>       head = &b->head;
>>>       /* do not grab the lock unless need it (bucket_cnt > 0). */
>>>       if (locked) {
>>> -             ret = htab_lock_bucket(htab, b, batch, &flags);
>>> +             ret = htab_lock_bucket(b, &flags);
>>>               if (ret) {
>>>                       rcu_read_unlock();
>>>                       bpf_enable_instrumentation();
>>> @@ -1743,7 +1719,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>>>               /* Note that since bucket_cnt > 0 here, it is implicit
>>>                * that the locked was grabbed, so release it.
>>>                */
>>> -             htab_unlock_bucket(htab, b, batch, flags);
>>> +             htab_unlock_bucket(b, flags);
>>>               rcu_read_unlock();
>>>               bpf_enable_instrumentation();
>>>               goto after_loop;
>>> @@ -1754,7 +1730,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>>>               /* Note that since bucket_cnt > 0 here, it is implicit
>>>                * that the locked was grabbed, so release it.
>>>                */
>>> -             htab_unlock_bucket(htab, b, batch, flags);
>>> +             htab_unlock_bucket(b, flags);
>>>               rcu_read_unlock();
>>>               bpf_enable_instrumentation();
>>>               kvfree(keys);
>>> @@ -1815,7 +1791,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>>>               dst_val += value_size;
>>>       }
>>>
>>> -     htab_unlock_bucket(htab, b, batch, flags);
>>> +     htab_unlock_bucket(b, flags);
>>>       locked = false;
>>>
>>>       while (node_to_free) {
>
Hou Tao Nov. 22, 2022, 4:06 a.m. UTC | #5
Hi,

On 11/22/2022 12:01 PM, Hou Tao wrote:
> Hi,
>
> On 11/22/2022 11:12 AM, Tonghao Zhang wrote:
>> .
>>
>> On Tue, Nov 22, 2022 at 9:16 AM Hou Tao <houtao1@huawei.com> wrote:
>>> Hi,
>>>
>>> On 11/21/2022 6:05 PM, xiangxia.m.yue@gmail.com wrote:
>>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>>>>
>>>> The commit 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked"),
>>>> try to fix deadlock, but in some case, the deadlock occurs:
>>>>
>>>> * CPUn in task context with K1, and taking lock.
>>>> * CPUn interrupted by NMI context, with K2.
>>>> * They are using the same bucket, but different map_locked.
>>> It is possible when n_buckets is less than HASHTAB_MAP_LOCK_COUNT (e.g.,
>>> n_bucket=4). If using hash & min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) as the
>>> index of map_locked, I think the deadlock will be gone.
>> Yes, but for saving memory, HASHTAB_MAP_LOCK_MASK should not be too
>> large(now this value is 8-1).
>> if user define n_bucket ,e.g 8192, the part of bucket only are
>> selected via hash & min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1).
I don't mean to extend map_locked. Using hash & min(HASHTAB_MAP_LOCK_MASK,
n_bucket - 1) as the index of map_locked can guarantee that the same map_locked
entry is used whenever different updates contend for the same bucket lock.
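
A minimal sketch of that suggestion (the helper name is made up for
illustration; min_t() is used to keep the types consistent, and n_buckets is
already rounded up to a power of two by htab_map_alloc()):

static inline u32 htab_lock_idx(const struct bpf_htab *htab, u32 hash)
{
        /* Never use more lock slots than buckets, so two keys that hash
         * to the same bucket always pick the same map_locked entry.
         */
        return hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
}

For tables with n_buckets >= HASHTAB_MAP_LOCK_COUNT the mask is unchanged, so
only very small tables behave differently.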
> SNIP
Tonghao Zhang Nov. 24, 2022, 12:57 p.m. UTC | #6
On Tue, Nov 22, 2022 at 12:06 PM Hou Tao <houtao1@huawei.com> wrote:
>
> Hi,
>
> On 11/22/2022 12:01 PM, Hou Tao wrote:
> > Hi,
> >
> > On 11/22/2022 11:12 AM, Tonghao Zhang wrote:
> >> .
> >>
> >> On Tue, Nov 22, 2022 at 9:16 AM Hou Tao <houtao1@huawei.com> wrote:
> >>> Hi,
> >>>
> >>> On 11/21/2022 6:05 PM, xiangxia.m.yue@gmail.com wrote:
> >>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >>>>
> >>>> The commit 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked"),
> >>>> try to fix deadlock, but in some case, the deadlock occurs:
> >>>>
> >>>> * CPUn in task context with K1, and taking lock.
> >>>> * CPUn interrupted by NMI context, with K2.
> >>>> * They are using the same bucket, but different map_locked.
> >>> It is possible when n_buckets is less than HASHTAB_MAP_LOCK_COUNT (e.g.,
> >>> n_bucket=4). If using hash & min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) as the
> >>> index of map_locked, I think the deadlock will be gone.
> >> Yes, but for saving memory, HASHTAB_MAP_LOCK_MASK should not be too
> >> large(now this value is 8-1).
> >> if user define n_bucket ,e.g 8192, the part of bucket only are
> >> selected via hash & min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1).
> I don't mean to extend map_locked. Using hash & min(HASHTAB_MAP_LOCK_MASK,
> n_bucket - 1) as index of map_locked  can guarantee the same map_locked will be
> used if different update processes are using the same bucket lock.
Thanks, I got it. But I tried using hash = hash &
min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1) in
htab_lock_bucket()/htab_unlock_bucket(), and the warning occurs again.
> > SNIP
> >>> map_locked is also used to prevent the re-entrance of htab_lock_bucket() on the
> >>> same CPU, so only check in_nmi() is not enough.
> >> NMI, IRQ, and preempt may interrupt the task context.
> >> In htab_lock_bucket, raw_spin_lock_irqsave disable the preempt and
> >> irq. so only NMI may interrupt the codes, right ?
> > The re-entrance here means the nesting of bpf programs as show below:
> >
> > bpf_prog A
> > update map X
> >     htab_lock_bucket
> >         raw_spin_lock_irqsave()
> >     lookup_elem_raw()
> >         // bpf prog B is attached on lookup_elem_raw()
I am confused: bpf_prog A disables preemption and IRQs with
raw_spin_lock_irqsave(). Why can bpf prog B run here?
> >         bpf prog B
> >             update map X again and update the element
> >                 htab_lock_bucket()
> >                     // dead-lock
> >                     raw_spinlock_irqsave()
> > .
Hou Tao Nov. 24, 2022, 2:13 p.m. UTC | #7
Hi,

On 11/24/2022 8:57 PM, Tonghao Zhang wrote:
> On Tue, Nov 22, 2022 at 12:06 PM Hou Tao <houtao1@huawei.com> wrote:
>> Hi,
>>
>> On 11/22/2022 12:01 PM, Hou Tao wrote:
>>> Hi,
>>>
>>> On 11/22/2022 11:12 AM, Tonghao Zhang wrote:
>>>> .
>>>>
>>>> On Tue, Nov 22, 2022 at 9:16 AM Hou Tao <houtao1@huawei.com> wrote:
>>>>> Hi,
>>>>>
>>>>> On 11/21/2022 6:05 PM, xiangxia.m.yue@gmail.com wrote:
>>>>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>>>>>>
>>>>>> The commit 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked"),
>>>>>> try to fix deadlock, but in some case, the deadlock occurs:
>>>>>>
>>>>>> * CPUn in task context with K1, and taking lock.
>>>>>> * CPUn interrupted by NMI context, with K2.
>>>>>> * They are using the same bucket, but different map_locked.
>>>>> It is possible when n_buckets is less than HASHTAB_MAP_LOCK_COUNT (e.g.,
>>>>> n_bucket=4). If using hash & min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) as the
>>>>> index of map_locked, I think the deadlock will be gone.
>>>> Yes, but for saving memory, HASHTAB_MAP_LOCK_MASK should not be too
>>>> large(now this value is 8-1).
>>>> if user define n_bucket ,e.g 8192, the part of bucket only are
>>>> selected via hash & min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1).
>> I don't mean to extend map_locked. Using hash & min(HASHTAB_MAP_LOCK_MASK,
>> n_bucket - 1) as index of map_locked  can guarantee the same map_locked will be
>> used if different update processes are using the same bucket lock.
> Thanks, I got it. but I tried it using the hash = hash &
> min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1) in
> htab_lock_bucket/htab_unlock_bucket.
> But the warning occur again.
Does the deadlock actually happen, or do you just get a warning from lockdep?
Maybe it is a false alarm from lockdep. I will check tomorrow. Could you share
the steps to reproduce the problem, especially the size of the hash table?
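
For context, updates from NMI context (as in the scenario quoted above) usually
come from a perf_event program attached to a hardware event. A sketch along
these lines, with a deliberately tiny map so n_buckets stays below
HASHTAB_MAP_LOCK_COUNT, could be a starting point for a reproducer; the map
name, sizes and key pattern are illustrative assumptions, not the reporter's
actual setup:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* max_entries = 4 keeps n_buckets at 4 (it is rounded up to a power of
 * two), i.e. below HASHTAB_MAP_LOCK_COUNT (8), so two different
 * map_locked indices can end up guarding the same bucket.
 */
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 4);
        __type(key, __u32);
        __type(value, __u64);
} small_htab SEC(".maps");

SEC("perf_event")
int nmi_update(void *ctx)
{
        /* few distinct keys, so NMI and task context hit the same buckets */
        __u32 key = bpf_get_prandom_u32() & 3;
        __u64 val = 1;

        bpf_map_update_elem(&small_htab, &key, &val, BPF_ANY);
        return 0;
}

char _license[] SEC("license") = "GPL";

User space would attach this to a frequent hardware event (e.g. cycles with a
small sample period) while also updating the same map from task context.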
> > > SNIP
>>>>> map_locked is also used to prevent the re-entrance of htab_lock_bucket() on the
>>>>> same CPU, so only check in_nmi() is not enough.
>>>> NMI, IRQ, and preempt may interrupt the task context.
>>>> In htab_lock_bucket, raw_spin_lock_irqsave disable the preempt and
>>>> irq. so only NMI may interrupt the codes, right ?
>>> The re-entrance here means the nesting of bpf programs as show below:
>>>
>>> bpf_prog A
>>> update map X
>>>     htab_lock_bucket
>>>         raw_spin_lock_irqsave()
>>>     lookup_elem_raw()
>>>         // bpf prog B is attached on lookup_elem_raw()
> I am confused, bpf_prog A disables preempt and irq with
> raw_spin_lock_irqsave. Why bpf prog B run here?
Because program B (e.g., an fentry program) is attached to lookup_elem_raw(),
calling lookup_elem_raw() will invoke the fentry program first. I wrote a
test case for a similar scenario in the bpf selftests. The path of the test case
is tools/testing/selftests/bpf/prog_tests/htab_update.c; you can run it with
./test_progs -t htab_update/reenter_update.
>>>         bpf prog B
>>>             update map X again and update the element
>>>                 htab_lock_bucket()
>>>                     // dead-lock
>>>                     raw_spinlock_irqsave()
>>> .
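
For readers following along, the selftest mentioned above is roughly of this
shape (a sketch, not the exact selftest code): an fentry program on
lookup_elem_raw() that updates the same hash map, so the nested
htab_lock_bucket() call happens on the CPU that already holds the bucket lock.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, __u32);
} htab SEC(".maps");

/* htab_map_update_elem() calls lookup_elem_raw() while holding the bucket
 * lock, so this program's bpf_map_update_elem() re-enters
 * htab_lock_bucket() on the same CPU and must be rejected (the old code
 * returned -EBUSY) rather than left to spin on the same raw_spinlock.
 */
SEC("fentry/lookup_elem_raw")
int BPF_PROG(reenter_update)
{
        __u32 key = 0, value = 1;

        bpf_map_update_elem(&htab, &key, &value, BPF_ANY);
        return 0;
}

char _license[] SEC("license") = "GPL";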
Tonghao Zhang Nov. 28, 2022, 3:15 a.m. UTC | #8
On Thu, Nov 24, 2022 at 10:13 PM Hou Tao <houtao1@huawei.com> wrote:
>
> Hi,
>
> On 11/24/2022 8:57 PM, Tonghao Zhang wrote:
> > On Tue, Nov 22, 2022 at 12:06 PM Hou Tao <houtao1@huawei.com> wrote:
> >> Hi,
> >>
> >> On 11/22/2022 12:01 PM, Hou Tao wrote:
> >>> Hi,
> >>>
> >>> On 11/22/2022 11:12 AM, Tonghao Zhang wrote:
> >>>> .
> >>>>
> >>>> On Tue, Nov 22, 2022 at 9:16 AM Hou Tao <houtao1@huawei.com> wrote:
> >>>>> Hi,
> >>>>>
> >>>>> On 11/21/2022 6:05 PM, xiangxia.m.yue@gmail.com wrote:
> >>>>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >>>>>>
> >>>>>> The commit 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked"),
> >>>>>> try to fix deadlock, but in some case, the deadlock occurs:
> >>>>>>
> >>>>>> * CPUn in task context with K1, and taking lock.
> >>>>>> * CPUn interrupted by NMI context, with K2.
> >>>>>> * They are using the same bucket, but different map_locked.
> >>>>> It is possible when n_buckets is less than HASHTAB_MAP_LOCK_COUNT (e.g.,
> >>>>> n_bucket=4). If using hash & min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) as the
> >>>>> index of map_locked, I think the deadlock will be gone.
> >>>> Yes, but for saving memory, HASHTAB_MAP_LOCK_MASK should not be too
> >>>> large(now this value is 8-1).
> >>>> if user define n_bucket ,e.g 8192, the part of bucket only are
> >>>> selected via hash & min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1).
> >> I don't mean to extend map_locked. Using hash & min(HASHTAB_MAP_LOCK_MASK,
> >> n_bucket - 1) as index of map_locked  can guarantee the same map_locked will be
> >> used if different update processes are using the same bucket lock.
> > Thanks, I got it. but I tried it using the hash = hash &
> > min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1) in
> > htab_lock_bucket/htab_unlock_bucket.
> > But the warning occur again.
> Does the deadlock happen ? Or just get the warning from lockdep. Maybe it is
> just a false alarm from lockdep. I will check tomorrow. Could you share the
> steps on how to reproduce the problem, specially the size of the hash table ?
Hi,
It is only a warning from lockdep; no actual deadlock happened.
1. the kernel .config
#
# Debug Oops, Lockups and Hangs
#
CONFIG_PANIC_ON_OOPS=y
CONFIG_PANIC_ON_OOPS_VALUE=1
CONFIG_PANIC_TIMEOUT=0
CONFIG_LOCKUP_DETECTOR=y
CONFIG_SOFTLOCKUP_DETECTOR=y
# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
CONFIG_HARDLOCKUP_DETECTOR_PERF=y
CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y
CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120
# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
# CONFIG_WQ_WATCHDOG is not set
# CONFIG_TEST_LOCKUP is not set
# end of Debug Oops, Lockups and Hangs

2. bpf.c, the map size is 2.
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 2);
        __uint(key_size, sizeof(unsigned int));
        __uint(value_size, sizeof(unsigned int));
} map1 SEC(".maps");

static int bpf_update_data()
{
        unsigned int val = 1, key = 0;

        return bpf_map_update_elem(&map1, &key, &val, BPF_ANY);
}

SEC("kprobe/ip_rcv")
int bpf_prog1(struct pt_regs *regs)
{
        bpf_update_data();
        return 0;
}

SEC("tracepoint/nmi/nmi_handler")
int bpf_prog2(struct pt_regs *regs)
{
        bpf_update_data();
        return 0;
}

char _license[] SEC("license") = "GPL";
unsigned int _version SEC("version") = LINUX_VERSION_CODE;

3. bpf loader.
#include "kprobe-example.skel.h"

#include <stdio.h>
#include <unistd.h>
#include <errno.h>

#include <bpf/bpf.h>

int main()
{
        struct kprobe_example *skel;
        int map_fd, prog_fd;
        int i;
        int err = 0;

        skel = kprobe_example__open_and_load();
        if (!skel)
                return -1;

        err = kprobe_example__attach(skel);
        if (err)
                goto cleanup;

        /* all libbpf APIs are usable */
        prog_fd = bpf_program__fd(skel->progs.bpf_prog1);
        map_fd = bpf_map__fd(skel->maps.map1);

        printf("map_fd: %d\n", map_fd);

        unsigned int val = 0, key = 0;

        while (1) {
                bpf_map_delete_elem(map_fd, &key);
                bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
        }

cleanup:
        kprobe_example__destroy(skel);
        return err;
}

4. run the bpf loader and perf record for NMI interrupts; the warning occurs.
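Any CPU sampling perf event that fires PMU NMIs should do; for example,
something like:

perf record -e cycles -a -- sleep 30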

> >   > > SNIP
> >>>>>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> >>>>>> index 22855d6ff6d3..429acd97c869 100644
> >>>>>> --- a/kernel/bpf/hashtab.c
> >>>>>> +++ b/kernel/bpf/hashtab.c
> >>>>>> @@ -80,9 +80,6 @@ struct bucket {
> >>>>>>       raw_spinlock_t raw_lock;
> >>>>>>  };
> >>>>>>
> >>>>>> -#define HASHTAB_MAP_LOCK_COUNT 8
> >>>>>> -#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
> >>>>>> -
> >>>>>>  struct bpf_htab {
> >>>>>>       struct bpf_map map;
> >>>>>>       struct bpf_mem_alloc ma;
> >>>>>> @@ -104,7 +101,6 @@ struct bpf_htab {
> >>>>>>       u32 elem_size;  /* size of each element in bytes */
> >>>>>>       u32 hashrnd;
> >>>>>>       struct lock_class_key lockdep_key;
> >>>>>> -     int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
> >>>>>>  };
> >>>>>>
> >>>>>>  /* each htab element is struct htab_elem + key + value */
> >>>>>> @@ -146,35 +142,26 @@ static void htab_init_buckets(struct bpf_htab *htab)
> >>>>>>       }
> >>>>>>  }
> >>>>>>
> >>>>>> -static inline int htab_lock_bucket(const struct bpf_htab *htab,
> >>>>>> -                                struct bucket *b, u32 hash,
> >>>>>> +static inline int htab_lock_bucket(struct bucket *b,
> >>>>>>                                  unsigned long *pflags)
> >>>>>>  {
> >>>>>>       unsigned long flags;
> >>>>>>
> >>>>>> -     hash = hash & HASHTAB_MAP_LOCK_MASK;
> >>>>>> -
> >>>>>> -     preempt_disable();
> >>>>>> -     if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
> >>>>>> -             __this_cpu_dec(*(htab->map_locked[hash]));
> >>>>>> -             preempt_enable();
> >>>>>> -             return -EBUSY;
> >>>>>> +     if (in_nmi()) {
> >>>>>> +             if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
> >>>>>> +                     return -EBUSY;
> >>>>>> +     } else {
> >>>>>> +             raw_spin_lock_irqsave(&b->raw_lock, flags);
> >>>>>>       }
> >>>>>>
> >>>>>> -     raw_spin_lock_irqsave(&b->raw_lock, flags);
> >>>>>>       *pflags = flags;
> >>>>>> -
> >>>>>>       return 0;
> >>>>>>  }
> >>>>> map_locked is also used to prevent the re-entrance of htab_lock_bucket() on the
> >>>>> same CPU, so only check in_nmi() is not enough.
> >>>> NMI, IRQ, and preempt may interrupt the task context.
> >>>> In htab_lock_bucket, raw_spin_lock_irqsave disable the preempt and
> >>>> irq. so only NMI may interrupt the codes, right ?
> >>> The re-entrance here means the nesting of bpf programs as show below:
> >>>
> >>> bpf_prog A
> >>> update map X
> >>>     htab_lock_bucket
> >>>         raw_spin_lock_irqsave()
> >>>     lookup_elem_raw()
> >>>         // bpf prog B is attached on lookup_elem_raw()
> > I am confused, bpf_prog A disables preempt and irq with
> > raw_spin_lock_irqsave. Why bpf prog B run here?
> Because program B (e.g., fentry program) has been attached on lookup_elem_raw(),
> calling lookup_elem_raw() will call the fentry program first. I had written a
> test case for the similar scenario in bpf selftests. The path of the test case
> is tools/testing/selftests/bpf/prog_tests/htab_update.c, you can use ./test_prog
> -t htab_update/reenter_update to run the test case.
> >>>         bpf prog B
> >>>             update map X again and update the element
> >>>                 htab_lock_bucket()
> >>>                     // dead-lock
> >>>                     raw_spinlock_irqsave()
> >>> .
> >>>
> >>>>>> -static inline void htab_unlock_bucket(const struct bpf_htab *htab,
> >>>>>> -                                   struct bucket *b, u32 hash,
> >>>>>> +static inline void htab_unlock_bucket(struct bucket *b,
> >>>>>>                                     unsigned long flags)
> >>>>>>  {
> >>>>>> -     hash = hash & HASHTAB_MAP_LOCK_MASK;
> >>>>>>       raw_spin_unlock_irqrestore(&b->raw_lock, flags);
> >>>>>> -     __this_cpu_dec(*(htab->map_locked[hash]));
> >>>>>> -     preempt_enable();
> >>>>>>  }
> >>>>>>
> >>>>>>  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
> >>>>>> @@ -467,7 +454,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >>>>>>       bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
> >>>>>>       bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
> >>>>>>       struct bpf_htab *htab;
> >>>>>> -     int err, i;
> >>>>>> +     int err;
> >>>>>>
> >>>>>>       htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
> >>>>>>       if (!htab)
> >>>>>> @@ -513,15 +500,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >>>>>>       if (!htab->buckets)
> >>>>>>               goto free_htab;
> >>>>>>
> >>>>>> -     for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
> >>>>>> -             htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
> >>>>>> -                                                        sizeof(int),
> >>>>>> -                                                        sizeof(int),
> >>>>>> -                                                        GFP_USER);
> >>>>>> -             if (!htab->map_locked[i])
> >>>>>> -                     goto free_map_locked;
> >>>>>> -     }
> >>>>>> -
> >>>>>>       if (htab->map.map_flags & BPF_F_ZERO_SEED)
> >>>>>>               htab->hashrnd = 0;
> >>>>>>       else
> >>>>>> @@ -549,13 +527,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >>>>>>       if (htab->use_percpu_counter) {
> >>>>>>               err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
> >>>>>>               if (err)
> >>>>>> -                     goto free_map_locked;
> >>>>>> +                     goto free_buckets;
> >>>>>>       }
> >>>>>>
> >>>>>>       if (prealloc) {
> >>>>>>               err = prealloc_init(htab);
> >>>>>>               if (err)
> >>>>>> -                     goto free_map_locked;
> >>>>>> +                     goto free_buckets;
> >>>>>>
> >>>>>>               if (!percpu && !lru) {
> >>>>>>                       /* lru itself can remove the least used element, so
> >>>>>> @@ -568,12 +546,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >>>>>>       } else {
> >>>>>>               err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
> >>>>>>               if (err)
> >>>>>> -                     goto free_map_locked;
> >>>>>> +                     goto free_buckets;
> >>>>>>               if (percpu) {
> >>>>>>                       err = bpf_mem_alloc_init(&htab->pcpu_ma,
> >>>>>>                                                round_up(htab->map.value_size, 8), true);
> >>>>>>                       if (err)
> >>>>>> -                             goto free_map_locked;
> >>>>>> +                             goto free_buckets;
> >>>>>>               }
> >>>>>>       }
> >>>>>>
> >>>>>> @@ -581,11 +559,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >>>>>>
> >>>>>>  free_prealloc:
> >>>>>>       prealloc_destroy(htab);
> >>>>>> -free_map_locked:
> >>>>>> +free_buckets:
> >>>>>>       if (htab->use_percpu_counter)
> >>>>>>               percpu_counter_destroy(&htab->pcount);
> >>>>>> -     for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
> >>>>>> -             free_percpu(htab->map_locked[i]);
> >>>>>> +
> >>>>>>       bpf_map_area_free(htab->buckets);
> >>>>>>       bpf_mem_alloc_destroy(&htab->pcpu_ma);
> >>>>>>       bpf_mem_alloc_destroy(&htab->ma);
> >>>>>> @@ -782,7 +759,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
> >>>>>>       b = __select_bucket(htab, tgt_l->hash);
> >>>>>>       head = &b->head;
> >>>>>>
> >>>>>> -     ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
> >>>>>> +     ret = htab_lock_bucket(b, &flags);
> >>>>>>       if (ret)
> >>>>>>               return false;
> >>>>>>
> >>>>>> @@ -793,7 +770,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
> >>>>>>                       break;
> >>>>>>               }
> >>>>>>
> >>>>>> -     htab_unlock_bucket(htab, b, tgt_l->hash, flags);
> >>>>>> +     htab_unlock_bucket(b, flags);
> >>>>>>
> >>>>>>       return l == tgt_l;
> >>>>>>  }
> >>>>>> @@ -1107,7 +1084,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> >>>>>>                */
> >>>>>>       }
> >>>>>>
> >>>>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
> >>>>>> +     ret = htab_lock_bucket(b, &flags);
> >>>>>>       if (ret)
> >>>>>>               return ret;
> >>>>>>
> >>>>>> @@ -1152,7 +1129,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> >>>>>>       }
> >>>>>>       ret = 0;
> >>>>>>  err:
> >>>>>> -     htab_unlock_bucket(htab, b, hash, flags);
> >>>>>> +     htab_unlock_bucket(b, flags);
> >>>>>>       return ret;
> >>>>>>  }
> >>>>>>
> >>>>>> @@ -1198,7 +1175,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
> >>>>>>       copy_map_value(&htab->map,
> >>>>>>                      l_new->key + round_up(map->key_size, 8), value);
> >>>>>>
> >>>>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
> >>>>>> +     ret = htab_lock_bucket(b, &flags);
> >>>>>>       if (ret)
> >>>>>>               return ret;
> >>>>>>
> >>>>>> @@ -1219,7 +1196,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
> >>>>>>       ret = 0;
> >>>>>>
> >>>>>>  err:
> >>>>>> -     htab_unlock_bucket(htab, b, hash, flags);
> >>>>>> +     htab_unlock_bucket(b, flags);
> >>>>>>
> >>>>>>       if (ret)
> >>>>>>               htab_lru_push_free(htab, l_new);
> >>>>>> @@ -1255,7 +1232,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
> >>>>>>       b = __select_bucket(htab, hash);
> >>>>>>       head = &b->head;
> >>>>>>
> >>>>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
> >>>>>> +     ret = htab_lock_bucket(b, &flags);
> >>>>>>       if (ret)
> >>>>>>               return ret;
> >>>>>>
> >>>>>> @@ -1280,7 +1257,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
> >>>>>>       }
> >>>>>>       ret = 0;
> >>>>>>  err:
> >>>>>> -     htab_unlock_bucket(htab, b, hash, flags);
> >>>>>> +     htab_unlock_bucket(b, flags);
> >>>>>>       return ret;
> >>>>>>  }
> >>>>>>
> >>>>>> @@ -1321,7 +1298,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
> >>>>>>                       return -ENOMEM;
> >>>>>>       }
> >>>>>>
> >>>>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
> >>>>>> +     ret = htab_lock_bucket(b, &flags);
> >>>>>>       if (ret)
> >>>>>>               return ret;
> >>>>>>
> >>>>>> @@ -1345,7 +1322,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
> >>>>>>       }
> >>>>>>       ret = 0;
> >>>>>>  err:
> >>>>>> -     htab_unlock_bucket(htab, b, hash, flags);
> >>>>>> +     htab_unlock_bucket(b, flags);
> >>>>>>       if (l_new)
> >>>>>>               bpf_lru_push_free(&htab->lru, &l_new->lru_node);
> >>>>>>       return ret;
> >>>>>> @@ -1384,7 +1361,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
> >>>>>>       b = __select_bucket(htab, hash);
> >>>>>>       head = &b->head;
> >>>>>>
> >>>>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
> >>>>>> +     ret = htab_lock_bucket(b, &flags);
> >>>>>>       if (ret)
> >>>>>>               return ret;
> >>>>>>
> >>>>>> @@ -1397,7 +1374,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
> >>>>>>               ret = -ENOENT;
> >>>>>>       }
> >>>>>>
> >>>>>> -     htab_unlock_bucket(htab, b, hash, flags);
> >>>>>> +     htab_unlock_bucket(b, flags);
> >>>>>>       return ret;
> >>>>>>  }
> >>>>>>
> >>>>>> @@ -1420,7 +1397,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
> >>>>>>       b = __select_bucket(htab, hash);
> >>>>>>       head = &b->head;
> >>>>>>
> >>>>>> -     ret = htab_lock_bucket(htab, b, hash, &flags);
> >>>>>> +     ret = htab_lock_bucket(b, &flags);
> >>>>>>       if (ret)
> >>>>>>               return ret;
> >>>>>>
> >>>>>> @@ -1431,7 +1408,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
> >>>>>>       else
> >>>>>>               ret = -ENOENT;
> >>>>>>
> >>>>>> -     htab_unlock_bucket(htab, b, hash, flags);
> >>>>>> +     htab_unlock_bucket(b, flags);
> >>>>>>       if (l)
> >>>>>>               htab_lru_push_free(htab, l);
> >>>>>>       return ret;
> >>>>>> @@ -1494,7 +1471,6 @@ static void htab_map_free_timers(struct bpf_map *map)
> >>>>>>  static void htab_map_free(struct bpf_map *map)
> >>>>>>  {
> >>>>>>       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> >>>>>> -     int i;
> >>>>>>
> >>>>>>       /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
> >>>>>>        * bpf_free_used_maps() is called after bpf prog is no longer executing.
> >>>>>> @@ -1517,10 +1493,10 @@ static void htab_map_free(struct bpf_map *map)
> >>>>>>       bpf_map_area_free(htab->buckets);
> >>>>>>       bpf_mem_alloc_destroy(&htab->pcpu_ma);
> >>>>>>       bpf_mem_alloc_destroy(&htab->ma);
> >>>>>> +
> >>>>>>       if (htab->use_percpu_counter)
> >>>>>>               percpu_counter_destroy(&htab->pcount);
> >>>>>> -     for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
> >>>>>> -             free_percpu(htab->map_locked[i]);
> >>>>>> +
> >>>>>>       lockdep_unregister_key(&htab->lockdep_key);
> >>>>>>       bpf_map_area_free(htab);
> >>>>>>  }
> >>>>>> @@ -1564,7 +1540,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
> >>>>>>       b = __select_bucket(htab, hash);
> >>>>>>       head = &b->head;
> >>>>>>
> >>>>>> -     ret = htab_lock_bucket(htab, b, hash, &bflags);
> >>>>>> +     ret = htab_lock_bucket(b, &bflags);
> >>>>>>       if (ret)
> >>>>>>               return ret;
> >>>>>>
> >>>>>> @@ -1602,7 +1578,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
> >>>>>>                       free_htab_elem(htab, l);
> >>>>>>       }
> >>>>>>
> >>>>>> -     htab_unlock_bucket(htab, b, hash, bflags);
> >>>>>> +     htab_unlock_bucket(b, bflags);
> >>>>>>
> >>>>>>       if (is_lru_map && l)
> >>>>>>               htab_lru_push_free(htab, l);
> >>>>>> @@ -1720,7 +1696,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> >>>>>>       head = &b->head;
> >>>>>>       /* do not grab the lock unless need it (bucket_cnt > 0). */
> >>>>>>       if (locked) {
> >>>>>> -             ret = htab_lock_bucket(htab, b, batch, &flags);
> >>>>>> +             ret = htab_lock_bucket(b, &flags);
> >>>>>>               if (ret) {
> >>>>>>                       rcu_read_unlock();
> >>>>>>                       bpf_enable_instrumentation();
> >>>>>> @@ -1743,7 +1719,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> >>>>>>               /* Note that since bucket_cnt > 0 here, it is implicit
> >>>>>>                * that the locked was grabbed, so release it.
> >>>>>>                */
> >>>>>> -             htab_unlock_bucket(htab, b, batch, flags);
> >>>>>> +             htab_unlock_bucket(b, flags);
> >>>>>>               rcu_read_unlock();
> >>>>>>               bpf_enable_instrumentation();
> >>>>>>               goto after_loop;
> >>>>>> @@ -1754,7 +1730,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> >>>>>>               /* Note that since bucket_cnt > 0 here, it is implicit
> >>>>>>                * that the locked was grabbed, so release it.
> >>>>>>                */
> >>>>>> -             htab_unlock_bucket(htab, b, batch, flags);
> >>>>>> +             htab_unlock_bucket(b, flags);
> >>>>>>               rcu_read_unlock();
> >>>>>>               bpf_enable_instrumentation();
> >>>>>>               kvfree(keys);
> >>>>>> @@ -1815,7 +1791,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> >>>>>>               dst_val += value_size;
> >>>>>>       }
> >>>>>>
> >>>>>> -     htab_unlock_bucket(htab, b, batch, flags);
> >>>>>> +     htab_unlock_bucket(b, flags);
> >>>>>>       locked = false;
> >>>>>>
> >>>>>>       while (node_to_free) {
> >>> .
> >
>
Hao Luo Nov. 28, 2022, 9:55 p.m. UTC | #9
On Sun, Nov 27, 2022 at 7:15 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>

Hi Tonghao,

With a quick look at the htab_lock_bucket() and your problem
statement, I agree with Hou Tao that using hash &
min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) to index in map_locked seems
to fix the potential deadlock. Can you actually send your changes as
v2 so we can take a look and better help you? Also, can you explain
your solution in your commit message? Right now, your commit message
has only a problem statement and is not very clear. Please include
more details on what you do to fix the issue.
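
For reference, the indexing change we are discussing would look roughly
like the sketch below in htab_lock_bucket() (untested; min_t() is only
used to keep the types consistent), with the same clamping applied in
htab_unlock_bucket():

static inline int htab_lock_bucket(const struct bpf_htab *htab,
                                   struct bucket *b, u32 hash,
                                   unsigned long *pflags)
{
        unsigned long flags;

        /* Clamp to n_buckets so that keys hashing to the same bucket
         * always share the same map_locked slot.
         */
        hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);

        preempt_disable();
        if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
                __this_cpu_dec(*(htab->map_locked[hash]));
                preempt_enable();
                return -EBUSY;
        }

        raw_spin_lock_irqsave(&b->raw_lock, flags);
        *pflags = flags;
        return 0;
}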

Hao

> Hi
> only a warning from lockdep.
> 1. the kernel .config
> #
> # Debug Oops, Lockups and Hangs
> #
> CONFIG_PANIC_ON_OOPS=y
> CONFIG_PANIC_ON_OOPS_VALUE=1
> CONFIG_PANIC_TIMEOUT=0
> CONFIG_LOCKUP_DETECTOR=y
> CONFIG_SOFTLOCKUP_DETECTOR=y
> # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
> CONFIG_HARDLOCKUP_DETECTOR_PERF=y
> CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y
> CONFIG_HARDLOCKUP_DETECTOR=y
> CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
> CONFIG_DETECT_HUNG_TASK=y
> CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120
> # CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
> # CONFIG_WQ_WATCHDOG is not set
> # CONFIG_TEST_LOCKUP is not set
> # end of Debug Oops, Lockups and Hangs
>
> 2. bpf.c, the map size is 2.
> struct {
> __uint(type, BPF_MAP_TYPE_HASH);
> __uint(max_entries, 2);
> __uint(key_size, sizeof(unsigned int));
> __uint(value_size, sizeof(unsigned int));
> } map1 SEC(".maps");
>
> static int bpf_update_data()
> {
> unsigned int val = 1, key = 0;
>
> return bpf_map_update_elem(&map1, &key, &val, BPF_ANY);
> }
>
> SEC("kprobe/ip_rcv")
> int bpf_prog1(struct pt_regs *regs)
> {
> bpf_update_data();
> return 0;
> }
>
> SEC("tracepoint/nmi/nmi_handler")
> int bpf_prog2(struct pt_regs *regs)
> {
> bpf_update_data();
> return 0;
> }
>
> char _license[] SEC("license") = "GPL";
> unsigned int _version SEC("version") = LINUX_VERSION_CODE;
>
> 3. bpf loader.
> #include "kprobe-example.skel.h"
>
> #include <unistd.h>
> #include <errno.h>
>
> #include <bpf/bpf.h>
>
> int main()
> {
> struct kprobe_example *skel;
> int map_fd, prog_fd;
> int i;
> int err = 0;
>
> skel = kprobe_example__open_and_load();
> if (!skel)
> return -1;
>
> err = kprobe_example__attach(skel);
> if (err)
> goto cleanup;
>
> /* all libbpf APIs are usable */
> prog_fd = bpf_program__fd(skel->progs.bpf_prog1);
> map_fd = bpf_map__fd(skel->maps.map1);
>
> printf("map_fd: %d\n", map_fd);
>
> unsigned int val = 0, key = 0;
>
> while (1) {
> bpf_map_delete_elem(map_fd, &key);
> bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
> }
>
> cleanup:
> kprobe_example__destroy(skel);
> return err;
> }
>
> 4. run the bpf loader and perf record for nmi interrupts.  the warming occurs
>
> --
> Best regards, Tonghao
Hou Tao Nov. 29, 2022, 4:32 a.m. UTC | #10
Hi,

On 11/29/2022 5:55 AM, Hao Luo wrote:
> On Sun, Nov 27, 2022 at 7:15 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> Hi Tonghao,
>
> With a quick look at the htab_lock_bucket() and your problem
> statement, I agree with Hou Tao that using hash &
> min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) to index in map_locked seems
> to fix the potential deadlock. Can you actually send your changes as
> v2 so we can take a look and better help you? Also, can you explain
> your solution in your commit message? Right now, your commit message
> has only a problem statement and is not very clear. Please include
> more details on what you do to fix the issue.
>
> Hao
It would be better if the test case below could be rewritten as a bpf selftest.
Please see the comments below on how to improve it and reproduce the deadlock.
>
>> Hi
>> only a warning from lockdep.
Thanks for the detailed instructions. I can reproduce the warning using your
setup. I am not a lockdep expert, but it seems that fixing this warning requires
assigning a different lockdep class to each bucket. Because we use map_locked to
protect the acquisition of the bucket lock, I think we can define a lock_class_key
array in bpf_htab (e.g., lockdep_key[HASHTAB_MAP_LOCK_COUNT]) and initialize each
bucket lock accordingly.
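
Something roughly like the following (untested; just to illustrate the idea,
the exact init place may differ):

struct bpf_htab {
        ...
        struct lock_class_key lockdep_key[HASHTAB_MAP_LOCK_COUNT];
        ...
};

static void htab_init_buckets(struct bpf_htab *htab)
{
        unsigned int i;

        for (i = 0; i < htab->n_buckets; i++) {
                INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
                raw_spin_lock_init(&htab->buckets[i].raw_lock);
                /* one lockdep class per map_locked slot */
                lockdep_set_class(&htab->buckets[i].raw_lock,
                                  &htab->lockdep_key[i & HASHTAB_MAP_LOCK_MASK]);
        }
}

Each entry of the array would also need its own lockdep_register_key() during
map allocation and lockdep_unregister_key() in htab_map_free().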

>> 1. the kernel .config
>> #
>> # Debug Oops, Lockups and Hangs
>> #
>> CONFIG_PANIC_ON_OOPS=y
>> CONFIG_PANIC_ON_OOPS_VALUE=1
>> CONFIG_PANIC_TIMEOUT=0
>> CONFIG_LOCKUP_DETECTOR=y
>> CONFIG_SOFTLOCKUP_DETECTOR=y
>> # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
>> CONFIG_HARDLOCKUP_DETECTOR_PERF=y
>> CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y
>> CONFIG_HARDLOCKUP_DETECTOR=y
>> CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
>> CONFIG_DETECT_HUNG_TASK=y
>> CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120
>> # CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
>> # CONFIG_WQ_WATCHDOG is not set
>> # CONFIG_TEST_LOCKUP is not set
>> # end of Debug Oops, Lockups and Hangs
>>
>> 2. bpf.c, the map size is 2.
>> struct {
>> __uint(type, BPF_MAP_TYPE_HASH);
Add __uint(map_flags, BPF_F_ZERO_SEED); to ensure there is no random seed for
the hash calculation, so we can use key=4 and key=20 to construct a case where
the two keys have the same bucket index but different map_locked indexes.
>> __uint(max_entries, 2);
>> __uint(key_size, sizeof(unsigned int));
>> __uint(value_size, sizeof(unsigned int));
>> } map1 SEC(".maps");
>>
>> static int bpf_update_data()
>> {
>> unsigned int val = 1, key = 0;
key = 20
>>
>> return bpf_map_update_elem(&map1, &key, &val, BPF_ANY);
>> }
>>
>> SEC("kprobe/ip_rcv")
>> int bpf_prog1(struct pt_regs *regs)
>> {
>> bpf_update_data();
>> return 0;
>> }
The kprobe on ip_rcv is unnecessary; you can just remove it.
>>
>> SEC("tracepoint/nmi/nmi_handler")
>> int bpf_prog2(struct pt_regs *regs)
>> {
>> bpf_update_data();
>> return 0;
>> }
Please use SEC("fentry/nmi_handle") instead of SEC("tracepoint") and unfold
bpf_update_data() into the program, because a bpf program running on the
tracepoint will be blocked by bpf_prog_active, which is increased by the
bpf_map_update_elem() syscall through bpf_disable_instrumentation().
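
Something roughly like this (untested sketch, with the key value from the
comment above):

SEC("fentry/nmi_handle")
int bpf_prog2(void *ctx)
{
        unsigned int val = 1, key = 20;

        return bpf_map_update_elem(&map1, &key, &val, BPF_ANY);
}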
>>
>> char _license[] SEC("license") = "GPL";
>> unsigned int _version SEC("version") = LINUX_VERSION_CODE;
>>
>> 3. bpf loader.
>> #include "kprobe-example.skel.h"
>>
>> #include <unistd.h>
>> #include <errno.h>
>>
>> #include <bpf/bpf.h>
>>
>> int main()
>> {
>> struct kprobe_example *skel;
>> int map_fd, prog_fd;
>> int i;
>> int err = 0;
>>
>> skel = kprobe_example__open_and_load();
>> if (!skel)
>> return -1;
>>
>> err = kprobe_example__attach(skel);
>> if (err)
>> goto cleanup;
>>
>> /* all libbpf APIs are usable */
>> prog_fd = bpf_program__fd(skel->progs.bpf_prog1);
>> map_fd = bpf_map__fd(skel->maps.map1);
>>
>> printf("map_fd: %d\n", map_fd);
>>
>> unsigned int val = 0, key = 0;
>>
>> while (1) {
>> bpf_map_delete_elem(map_fd, &key);
Not needed either; doing only bpf_map_update_elem() is OK. Also change key
from 0 to 4, so it will have the same bucket index as key=20 but a different
map_locked index.
>> bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
>> }
The process also needs to be pinned to a specific CPU (e.g., CPU 0).
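
A minimal sketch for the loader (assuming _GNU_SOURCE is defined and
<sched.h> is included), run before the update loop:

        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);
        if (sched_setaffinity(0, sizeof(set), &set))
                return -1;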
>>
>> cleanup:
>> kprobe_example__destroy(skel);
>> return err;
>> }
>>
>> 4. run the bpf loader and perf record for nmi interrupts.  the warming occurs
For the perf event, you can reference prog_tests/find_vma.c for how to use
perf_event_open() to trigger a perf NMI interrupt. The perf event also needs
to be pinned to the same CPU as the caller of bpf_map_update_elem().

>>
>> --
>> Best regards, Tonghao
> .
Tonghao Zhang Nov. 29, 2022, 6:06 a.m. UTC | #11
On Tue, Nov 29, 2022 at 12:32 PM Hou Tao <houtao1@huawei.com> wrote:
>
> Hi,
>
> On 11/29/2022 5:55 AM, Hao Luo wrote:
> > On Sun, Nov 27, 2022 at 7:15 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > Hi Tonghao,
> >
> > With a quick look at the htab_lock_bucket() and your problem
> > statement, I agree with Hou Tao that using hash &
> > min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) to index in map_locked seems
> > to fix the potential deadlock. Can you actually send your changes as
> > v2 so we can take a look and better help you? Also, can you explain
> > your solution in your commit message? Right now, your commit message
> > has only a problem statement and is not very clear. Please include
> > more details on what you do to fix the issue.
> >
> > Hao
> It would be better if the test case below can be rewritten as a bpf selftests.
> Please see comments below on how to improve it and reproduce the deadlock.
> >
> >> Hi
> >> only a warning from lockdep.
> Thanks for your details instruction.  I can reproduce the warning by using your
> setup. I am not a lockdep expert, it seems that fixing such warning needs to set
> different lockdep class to the different bucket. Because we use map_locked to
> protect the acquisition of bucket lock, so I think we can define  lock_class_key
> array in bpf_htab (e.g., lockdep_key[HASHTAB_MAP_LOCK_COUNT]) and initialize the
> bucket lock accordingly.
Hi,
Thanks for your reply. Defining the lock_class_key array looks good.
One last question: how about using raw_spin_trylock_irqsave()? If the
bucket is already locked on the same or another CPU,
raw_spin_trylock_irqsave() will return false, and we can return -EBUSY
from htab_lock_bucket:

static inline int htab_lock_bucket(struct bucket *b,
                                   unsigned long *pflags)
{
        unsigned long flags;

        if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
                return -EBUSY;

        *pflags = flags;
        return 0;
}

> >> 1. the kernel .config
> >> #
> >> # Debug Oops, Lockups and Hangs
> >> #
> >> CONFIG_PANIC_ON_OOPS=y
> >> CONFIG_PANIC_ON_OOPS_VALUE=1
> >> CONFIG_PANIC_TIMEOUT=0
> >> CONFIG_LOCKUP_DETECTOR=y
> >> CONFIG_SOFTLOCKUP_DETECTOR=y
> >> # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
> >> CONFIG_HARDLOCKUP_DETECTOR_PERF=y
> >> CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y
> >> CONFIG_HARDLOCKUP_DETECTOR=y
> >> CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
> >> CONFIG_DETECT_HUNG_TASK=y
> >> CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120
> >> # CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
> >> # CONFIG_WQ_WATCHDOG is not set
> >> # CONFIG_TEST_LOCKUP is not set
> >> # end of Debug Oops, Lockups and Hangs
> >>
> >> 2. bpf.c, the map size is 2.
> >> struct {
> >> __uint(type, BPF_MAP_TYPE_HASH);
> Adding __uint(map_flags, BPF_F_ZERO_SEED); to ensure there will be no seed for
> hash calculation, so we can use key=4 and key=20 to construct the case that
> these two keys have the same bucket index but have different map_locked index.
> >> __uint(max_entries, 2);
> >> __uint(key_size, sizeof(unsigned int));
> >> __uint(value_size, sizeof(unsigned int));
> >> } map1 SEC(".maps");
> >>
> >> static int bpf_update_data()
> >> {
> >> unsigned int val = 1, key = 0;
> key = 20
> >>
> >> return bpf_map_update_elem(&map1, &key, &val, BPF_ANY);
> >> }
> >>
> >> SEC("kprobe/ip_rcv")
> >> int bpf_prog1(struct pt_regs *regs)
> >> {
> >> bpf_update_data();
> >> return 0;
> >> }
> kprobe on ip_rcv is unnecessary, you can just remove it.
> >>
> >> SEC("tracepoint/nmi/nmi_handler")
> >> int bpf_prog2(struct pt_regs *regs)
> >> {
> >> bpf_update_data();
> >> return 0;
> >> }
> Please use SEC("fentry/nmi_handle") instead of SEC("tracepoint") and unfold
> bpf_update_data(), because the running of bpf program on tracepoint will be
> blocked by bpf_prog_active which will be increased bpf_map_update_elem through
> bpf_disable_instrumentation().
> >>
> >> char _license[] SEC("license") = "GPL";
> >> unsigned int _version SEC("version") = LINUX_VERSION_CODE;
> >>
> >> 3. bpf loader.
> >> #include "kprobe-example.skel.h"
> >>
> >> #include <unistd.h>
> >> #include <errno.h>
> >>
> >> #include <bpf/bpf.h>
> >>
> >> int main()
> >> {
> >> struct kprobe_example *skel;
> >> int map_fd, prog_fd;
> >> int i;
> >> int err = 0;
> >>
> >> skel = kprobe_example__open_and_load();
> >> if (!skel)
> >> return -1;
> >>
> >> err = kprobe_example__attach(skel);
> >> if (err)
> >> goto cleanup;
> >>
> >> /* all libbpf APIs are usable */
> >> prog_fd = bpf_program__fd(skel->progs.bpf_prog1);
> >> map_fd = bpf_map__fd(skel->maps.map1);
> >>
> >> printf("map_fd: %d\n", map_fd);
> >>
> >> unsigned int val = 0, key = 0;
> >>
> >> while (1) {
> >> bpf_map_delete_elem(map_fd, &key);
> No needed neither. Only do bpf_map_update_elem() is OK. Also change key=0 from
> key=4, so it will have the same bucket index as key=20 but have different
> map_locked index.
> >> bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
> >> }
> Also need to pin the process on a specific CPU (e.g., CPU 0)
> >>
> >> cleanup:
> >> kprobe_example__destroy(skel);
> >> return err;
> >> }
> >>
> >> 4. run the bpf loader and perf record for nmi interrupts.  the warming occurs
> For perf event, you can reference prog_tests/find_vma.c on how to using
> perf_event_open to trigger a perf nmi interrupt. The perf event also needs to
> pin on a specific CPU as the caller of bpf_map_update_elem() does.
>
> >>
> >> --
> >> Best regards, Tonghao
> > .
>
Hou Tao Nov. 29, 2022, 7:56 a.m. UTC | #12
Hi,

On 11/29/2022 2:06 PM, Tonghao Zhang wrote:
> On Tue, Nov 29, 2022 at 12:32 PM Hou Tao <houtao1@huawei.com> wrote:
>> Hi,
>>
>> On 11/29/2022 5:55 AM, Hao Luo wrote:
>>> On Sun, Nov 27, 2022 at 7:15 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>>> Hi Tonghao,
>>>
>>> With a quick look at the htab_lock_bucket() and your problem
>>> statement, I agree with Hou Tao that using hash &
>>> min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) to index in map_locked seems
>>> to fix the potential deadlock. Can you actually send your changes as
>>> v2 so we can take a look and better help you? Also, can you explain
>>> your solution in your commit message? Right now, your commit message
>>> has only a problem statement and is not very clear. Please include
>>> more details on what you do to fix the issue.
>>>
>>> Hao
>> It would be better if the test case below can be rewritten as a bpf selftests.
>> Please see comments below on how to improve it and reproduce the deadlock.
>>>> Hi
>>>> only a warning from lockdep.
>> Thanks for your details instruction.  I can reproduce the warning by using your
>> setup. I am not a lockdep expert, it seems that fixing such warning needs to set
>> different lockdep class to the different bucket. Because we use map_locked to
>> protect the acquisition of bucket lock, so I think we can define  lock_class_key
>> array in bpf_htab (e.g., lockdep_key[HASHTAB_MAP_LOCK_COUNT]) and initialize the
>> bucket lock accordingly.
> Hi
> Thanks for your reply. define the lock_class_key array looks good.
> Last question: how about using  raw_spin_trylock_irqsave, if the
> bucket is locked on the same or other cpu.
> raw_spin_trylock_irqsave will return the false, we should return the
> -EBUSY in htab_lock_bucket.
>
> static inline int htab_lock_bucket(struct bucket *b,
>                                    unsigned long *pflags)
> {
>         unsigned long flags;
>
>         if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
>                 return -EBUSY;
>
>         *pflags = flags;
>         return 0;
> }
The flaw of the trylock solution is that it cannot distinguish between a real
deadlock and a lock that is merely under high contention. For example, if
another CPU is holding the bucket lock for a normal update, trylock will fail
and the update will be spuriously dropped with -EBUSY, even though waiting for
the lock would have succeeded. So I don't think it is a good idea to do that.
>
>>>> 1. the kernel .config
>>>> #
>>>> # Debug Oops, Lockups and Hangs
>>>> #
>>>> CONFIG_PANIC_ON_OOPS=y
>>>> CONFIG_PANIC_ON_OOPS_VALUE=1
>>>> CONFIG_PANIC_TIMEOUT=0
>>>> CONFIG_LOCKUP_DETECTOR=y
>>>> CONFIG_SOFTLOCKUP_DETECTOR=y
>>>> # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
>>>> CONFIG_HARDLOCKUP_DETECTOR_PERF=y
>>>> CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y
>>>> CONFIG_HARDLOCKUP_DETECTOR=y
>>>> CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
>>>> CONFIG_DETECT_HUNG_TASK=y
>>>> CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120
>>>> # CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
>>>> # CONFIG_WQ_WATCHDOG is not set
>>>> # CONFIG_TEST_LOCKUP is not set
>>>> # end of Debug Oops, Lockups and Hangs
>>>>
>>>> 2. bpf.c, the map size is 2.
>>>> struct {
>>>> __uint(type, BPF_MAP_TYPE_HASH);
>> Adding __uint(map_flags, BPF_F_ZERO_SEED); to ensure there will be no seed for
>> hash calculation, so we can use key=4 and key=20 to construct the case that
>> these two keys have the same bucket index but have different map_locked index.
>>>> __uint(max_entries, 2);
>>>> __uint(key_size, sizeof(unsigned int));
>>>> __uint(value_size, sizeof(unsigned int));
>>>> } map1 SEC(".maps");
>>>>
>>>> static int bpf_update_data()
>>>> {
>>>> unsigned int val = 1, key = 0;
>> key = 20
>>>> return bpf_map_update_elem(&map1, &key, &val, BPF_ANY);
>>>> }
>>>>
>>>> SEC("kprobe/ip_rcv")
>>>> int bpf_prog1(struct pt_regs *regs)
>>>> {
>>>> bpf_update_data();
>>>> return 0;
>>>> }
>> kprobe on ip_rcv is unnecessary, you can just remove it.
>>>> SEC("tracepoint/nmi/nmi_handler")
>>>> int bpf_prog2(struct pt_regs *regs)
>>>> {
>>>> bpf_update_data();
>>>> return 0;
>>>> }
>> Please use SEC("fentry/nmi_handle") instead of SEC("tracepoint") and unfold
>> bpf_update_data(), because the running of bpf program on tracepoint will be
>> blocked by bpf_prog_active which will be increased bpf_map_update_elem through
>> bpf_disable_instrumentation().
>>>> char _license[] SEC("license") = "GPL";
>>>> unsigned int _version SEC("version") = LINUX_VERSION_CODE;
>>>>
>>>> 3. bpf loader.
>>>> #include "kprobe-example.skel.h"
>>>>
>>>> #include <unistd.h>
>>>> #include <errno.h>
>>>>
>>>> #include <bpf/bpf.h>
>>>>
>>>> int main()
>>>> {
>>>> struct kprobe_example *skel;
>>>> int map_fd, prog_fd;
>>>> int i;
>>>> int err = 0;
>>>>
>>>> skel = kprobe_example__open_and_load();
>>>> if (!skel)
>>>> return -1;
>>>>
>>>> err = kprobe_example__attach(skel);
>>>> if (err)
>>>> goto cleanup;
>>>>
>>>> /* all libbpf APIs are usable */
>>>> prog_fd = bpf_program__fd(skel->progs.bpf_prog1);
>>>> map_fd = bpf_map__fd(skel->maps.map1);
>>>>
>>>> printf("map_fd: %d\n", map_fd);
>>>>
>>>> unsigned int val = 0, key = 0;
>>>>
>>>> while (1) {
>>>> bpf_map_delete_elem(map_fd, &key);
>> No needed neither. Only do bpf_map_update_elem() is OK. Also change key=0 from
>> key=4, so it will have the same bucket index as key=20 but have different
>> map_locked index.
>>>> bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
>>>> }
>> Also need to pin the process on a specific CPU (e.g., CPU 0)
>>>> cleanup:
>>>> kprobe_example__destroy(skel);
>>>> return err;
>>>> }
>>>>
>>>> 4. run the bpf loader and perf record for nmi interrupts.  the warming occurs
>> For perf event, you can reference prog_tests/find_vma.c on how to using
>> perf_event_open to trigger a perf nmi interrupt. The perf event also needs to
>> pin on a specific CPU as the caller of bpf_map_update_elem() does.
>>
>>>> --
>>>> Best regards, Tonghao
>>> .
>
Hou Tao Nov. 29, 2022, 12:45 p.m. UTC | #13
Hi,

On 11/29/2022 2:06 PM, Tonghao Zhang wrote:
> On Tue, Nov 29, 2022 at 12:32 PM Hou Tao <houtao1@huawei.com> wrote:
>> Hi,
>>
>> On 11/29/2022 5:55 AM, Hao Luo wrote:
>>> On Sun, Nov 27, 2022 at 7:15 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>>> Hi Tonghao,
>>>
>>> With a quick look at the htab_lock_bucket() and your problem
>>> statement, I agree with Hou Tao that using hash &
>>> min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) to index in map_locked seems
>>> to fix the potential deadlock. Can you actually send your changes as
>>> v2 so we can take a look and better help you? Also, can you explain
>>> your solution in your commit message? Right now, your commit message
>>> has only a problem statement and is not very clear. Please include
>>> more details on what you do to fix the issue.
>>>
>>> Hao
>> It would be better if the test case below can be rewritten as a bpf selftests.
>> Please see comments below on how to improve it and reproduce the deadlock.
>>>> Hi
>>>> only a warning from lockdep.
>> Thanks for your details instruction.  I can reproduce the warning by using your
>> setup. I am not a lockdep expert, it seems that fixing such warning needs to set
>> different lockdep class to the different bucket. Because we use map_locked to
>> protect the acquisition of bucket lock, so I think we can define  lock_class_key
>> array in bpf_htab (e.g., lockdep_key[HASHTAB_MAP_LOCK_COUNT]) and initialize the
>> bucket lock accordingly.
The proposed lockdep solution doesn't work; I still got the lockdep warning
after applying it, so cc +locking experts +lkml.org for lockdep help.

Hi lockdep experts,

We are trying to fix the following lockdep warning from the bpf subsystem:

[   36.092222] ================================
[   36.092230] WARNING: inconsistent lock state
[   36.092234] 6.1.0-rc5+ #81 Tainted: G            E
[   36.092236] --------------------------------
[   36.092237] inconsistent {INITIAL USE} -> {IN-NMI} usage.
[   36.092238] perf/1515 [HC1[1]:SC0[0]:HE0:SE1] takes:
[   36.092242] ffff888341acd1a0 (&htab->lockdep_key){....}-{2:2}, at:
htab_lock_bucket+0x4d/0x58
[   36.092253] {INITIAL USE} state was registered at:
[   36.092255]   mark_usage+0x1d/0x11d
[   36.092262]   __lock_acquire+0x3c9/0x6ed
[   36.092266]   lock_acquire+0x23d/0x29a
[   36.092270]   _raw_spin_lock_irqsave+0x43/0x7f
[   36.092274]   htab_lock_bucket+0x4d/0x58
[   36.092276]   htab_map_delete_elem+0x82/0xfb
[   36.092278]   map_delete_elem+0x156/0x1ac
[   36.092282]   __sys_bpf+0x138/0xb71
[   36.092285]   __do_sys_bpf+0xd/0x15
[   36.092288]   do_syscall_64+0x6d/0x84
[   36.092291]   entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   36.092295] irq event stamp: 120346
[   36.092296] hardirqs last  enabled at (120345): [<ffffffff8180b97f>]
_raw_spin_unlock_irq+0x24/0x39
[   36.092299] hardirqs last disabled at (120346): [<ffffffff81169e85>]
generic_exec_single+0x40/0xb9
[   36.092303] softirqs last  enabled at (120268): [<ffffffff81c00347>]
__do_softirq+0x347/0x387
[   36.092307] softirqs last disabled at (120133): [<ffffffff810ba4f0>]
__irq_exit_rcu+0x67/0xc6
[   36.092311]
[   36.092311] other info that might help us debug this:
[   36.092312]  Possible unsafe locking scenario:
[   36.092312]
[   36.092313]        CPU0
[   36.092313]        ----
[   36.092314]   lock(&htab->lockdep_key);
[   36.092315]   <Interrupt>
[   36.092316]     lock(&htab->lockdep_key);
[   36.092318]
[   36.092318]  *** DEADLOCK ***
[   36.092318]
[   36.092318] 3 locks held by perf/1515:
[   36.092320]  #0: ffff8881b9805cc0 (&cpuctx_mutex){+.+.}-{4:4}, at:
perf_event_ctx_lock_nested+0x8e/0xba
[   36.092327]  #1: ffff8881075ecc20 (&event->child_mutex){+.+.}-{4:4}, at:
perf_event_for_each_child+0x35/0x76
[   36.092332]  #2: ffff8881b9805c20 (&cpuctx_lock){-.-.}-{2:2}, at:
perf_ctx_lock+0x12/0x27
[   36.092339]
[   36.092339] stack backtrace:
[   36.092341] CPU: 0 PID: 1515 Comm: perf Tainted: G            E     
6.1.0-rc5+ #81
[   36.092344] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[   36.092349] Call Trace:
[   36.092351]  <NMI>
[   36.092354]  dump_stack_lvl+0x57/0x81
[   36.092359]  lock_acquire+0x1f4/0x29a
[   36.092363]  ? handle_pmi_common+0x13f/0x1f0
[   36.092366]  ? htab_lock_bucket+0x4d/0x58
[   36.092371]  _raw_spin_lock_irqsave+0x43/0x7f
[   36.092374]  ? htab_lock_bucket+0x4d/0x58
[   36.092377]  htab_lock_bucket+0x4d/0x58
[   36.092379]  htab_map_update_elem+0x11e/0x220
[   36.092386]  bpf_prog_f3a535ca81a8128a_bpf_prog2+0x3e/0x42
[   36.092392]  trace_call_bpf+0x177/0x215
[   36.092398]  perf_trace_run_bpf_submit+0x52/0xaa
[   36.092403]  ? x86_pmu_stop+0x97/0x97
[   36.092407]  perf_trace_nmi_handler+0xb7/0xe0
[   36.092415]  nmi_handle+0x116/0x254
[   36.092418]  ? x86_pmu_stop+0x97/0x97
[   36.092423]  default_do_nmi+0x3d/0xf6
[   36.092428]  exc_nmi+0xa1/0x109
[   36.092432]  end_repeat_nmi+0x16/0x67
[   36.092436] RIP: 0010:wrmsrl+0xd/0x1b
[   36.092441] Code: 04 01 00 00 c6 84 07 48 01 00 00 01 5b e9 46 15 80 00 5b c3
cc cc cc cc c3 cc cc cc cc 48 89 f2 89 f9 89 f0 48 c1 ea 20 0f 30 <66> 90 c3 cc
cc cc cc 31 d2 e9 2f 04 49 00 0f 1f 44 00 00 40 0f6
[   36.092443] RSP: 0018:ffffc900043dfc48 EFLAGS: 00000002
[   36.092445] RAX: 000000000000000f RBX: ffff8881b96153e0 RCX: 000000000000038f
[   36.092447] RDX: 0000000000000007 RSI: 000000070000000f RDI: 000000000000038f
[   36.092449] RBP: 000000070000000f R08: ffffffffffffffff R09: ffff8881053bdaa8
[   36.092451] R10: ffff8881b9805d40 R11: 0000000000000005 R12: ffff8881b9805c00
[   36.092452] R13: 0000000000000000 R14: 0000000000000000 R15: ffff8881075ec970
[   36.092460]  ? wrmsrl+0xd/0x1b
[   36.092465]  ? wrmsrl+0xd/0x1b
[   36.092469]  </NMI>
[   36.092469]  <TASK>
[   36.092470]  __intel_pmu_enable_all.constprop.0+0x7c/0xaf
[   36.092475]  event_function+0xb6/0xd3
[   36.092478]  ? cpu_to_node+0x1a/0x1a
[   36.092482]  ? cpu_to_node+0x1a/0x1a
[   36.092485]  remote_function+0x1e/0x4c
[   36.092489]  generic_exec_single+0x48/0xb9
[   36.092492]  ? __lock_acquire+0x666/0x6ed
[   36.092497]  smp_call_function_single+0xbf/0x106
[   36.092499]  ? cpu_to_node+0x1a/0x1a
[   36.092504]  ? kvm_sched_clock_read+0x5/0x11
[   36.092508]  ? __perf_event_task_sched_in+0x13d/0x13d
[   36.092513]  cpu_function_call+0x47/0x69
[   36.092516]  ? perf_event_update_time+0x52/0x52
[   36.092519]  event_function_call+0x89/0x117
[   36.092521]  ? __perf_event_task_sched_in+0x13d/0x13d
[   36.092526]  ? _perf_event_disable+0x4a/0x4a
[   36.092528]  perf_event_for_each_child+0x3d/0x76
[   36.092532]  ? _perf_event_disable+0x4a/0x4a
[   36.092533]  _perf_ioctl+0x564/0x590
[   36.092537]  ? __lock_release+0xd5/0x1b0
[   36.092543]  ? perf_event_ctx_lock_nested+0x8e/0xba
[   36.092547]  perf_ioctl+0x42/0x5f
[   36.092551]  vfs_ioctl+0x1e/0x2f
[   36.092554]  __do_sys_ioctl+0x66/0x89
[   36.092559]  do_syscall_64+0x6d/0x84
[   36.092563]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   36.092566] RIP: 0033:0x7fe7110f362b
[   36.092569] Code: 0f 1e fa 48 8b 05 5d b8 2c 00 64 c7 00 26 00 00 00 48 c7 c0
ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0
ff ff 73 01 c3 48 8b 0d 2d b8 2c 00 f7 d8 64 89 018
[   36.092570] RSP: 002b:00007ffebb8e4b08 EFLAGS: 00000246 ORIG_RAX:
0000000000000010
[   36.092573] RAX: ffffffffffffffda RBX: 0000000000002400 RCX: 00007fe7110f362b
[   36.092575] RDX: 0000000000000000 RSI: 0000000000002400 RDI: 0000000000000013
[   36.092576] RBP: 00007ffebb8e4b40 R08: 0000000000000001 R09: 000055c1db4a5b40
[   36.092577] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
[   36.092579] R13: 000055c1db3b2a30 R14: 0000000000000000 R15: 0000000000000000
[   36.092586]  </TASK>

The lockdep warning is a false alarm, because the per-cpu map_locked count
must be zero before b->raw_lock is acquired. If b->raw_lock has already been
acquired by a normal process through htab_map_update_elem() and an NMI then
interrupts that process and tries to acquire the same b->raw_lock, the
acquisition will fail because the per-cpu map_locked count has already been
increased by the process.

So besides using lockdep_off() and lockdep_on() to temporarily disable and
re-enable lockdep in htab_lock_bucket() and htab_unlock_bucket(), are there
other ways to fix the lockdep warning?
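
For reference, the lockdep_off() variant would be only the following change
around the lock acquisition in htab_lock_bucket(), and the same around the
release in htab_unlock_bucket() (shown to illustrate the question, not as a
preferred fix):

        /* map_locked already prevents re-entrance on this CPU, so the
         * IN-NMI report is a false positive; hide the acquisition from
         * lockdep.
         */
        lockdep_off();
        raw_spin_lock_irqsave(&b->raw_lock, flags);
        lockdep_on();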

Thanks,
Tao




> Hi
> Thanks for your reply. define the lock_class_key array looks good.
> Last question: how about using  raw_spin_trylock_irqsave, if the
> bucket is locked on the same or other cpu.
> raw_spin_trylock_irqsave will return the false, we should return the
> -EBUSY in htab_lock_bucket.
>
> static inline int htab_lock_bucket(struct bucket *b,
>                                    unsigned long *pflags)
> {
>         unsigned long flags;
>
>         if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
>                 return -EBUSY;
>
>         *pflags = flags;
>         return 0;
> }
>
>>>> 1. the kernel .config
>>>> #
>>>> # Debug Oops, Lockups and Hangs
>>>> #
>>>> CONFIG_PANIC_ON_OOPS=y
>>>> CONFIG_PANIC_ON_OOPS_VALUE=1
>>>> CONFIG_PANIC_TIMEOUT=0
>>>> CONFIG_LOCKUP_DETECTOR=y
>>>> CONFIG_SOFTLOCKUP_DETECTOR=y
>>>> # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
>>>> CONFIG_HARDLOCKUP_DETECTOR_PERF=y
>>>> CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y
>>>> CONFIG_HARDLOCKUP_DETECTOR=y
>>>> CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
>>>> CONFIG_DETECT_HUNG_TASK=y
>>>> CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120
>>>> # CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
>>>> # CONFIG_WQ_WATCHDOG is not set
>>>> # CONFIG_TEST_LOCKUP is not set
>>>> # end of Debug Oops, Lockups and Hangs
>>>>
>>>> 2. bpf.c, the map size is 2.
>>>> struct {
>>>> __uint(type, BPF_MAP_TYPE_HASH);
>> Adding __uint(map_flags, BPF_F_ZERO_SEED); to ensure there will be no seed for
>> hash calculation, so we can use key=4 and key=20 to construct the case that
>> these two keys have the same bucket index but have different map_locked index.
>>>> __uint(max_entries, 2);
>>>> __uint(key_size, sizeof(unsigned int));
>>>> __uint(value_size, sizeof(unsigned int));
>>>> } map1 SEC(".maps");
>>>>
>>>> static int bpf_update_data()
>>>> {
>>>> unsigned int val = 1, key = 0;
>> key = 20
>>>> return bpf_map_update_elem(&map1, &key, &val, BPF_ANY);
>>>> }
>>>>
>>>> SEC("kprobe/ip_rcv")
>>>> int bpf_prog1(struct pt_regs *regs)
>>>> {
>>>> bpf_update_data();
>>>> return 0;
>>>> }
>> kprobe on ip_rcv is unnecessary, you can just remove it.
>>>> SEC("tracepoint/nmi/nmi_handler")
>>>> int bpf_prog2(struct pt_regs *regs)
>>>> {
>>>> bpf_update_data();
>>>> return 0;
>>>> }
>> Please use SEC("fentry/nmi_handle") instead of SEC("tracepoint") and unfold
>> bpf_update_data(), because the running of bpf program on tracepoint will be
>> blocked by bpf_prog_active which will be increased bpf_map_update_elem through
>> bpf_disable_instrumentation().
>>>> char _license[] SEC("license") = "GPL";
>>>> unsigned int _version SEC("version") = LINUX_VERSION_CODE;
>>>>
>>>> 3. bpf loader.
>>>> #include "kprobe-example.skel.h"
>>>>
>>>> #include <unistd.h>
>>>> #include <errno.h>
>>>>
>>>> #include <bpf/bpf.h>
>>>>
>>>> int main()
>>>> {
>>>> struct kprobe_example *skel;
>>>> int map_fd, prog_fd;
>>>> int i;
>>>> int err = 0;
>>>>
>>>> skel = kprobe_example__open_and_load();
>>>> if (!skel)
>>>> return -1;
>>>>
>>>> err = kprobe_example__attach(skel);
>>>> if (err)
>>>> goto cleanup;
>>>>
>>>> /* all libbpf APIs are usable */
>>>> prog_fd = bpf_program__fd(skel->progs.bpf_prog1);
>>>> map_fd = bpf_map__fd(skel->maps.map1);
>>>>
>>>> printf("map_fd: %d\n", map_fd);
>>>>
>>>> unsigned int val = 0, key = 0;
>>>>
>>>> while (1) {
>>>> bpf_map_delete_elem(map_fd, &key);
>> No needed neither. Only do bpf_map_update_elem() is OK. Also change key=0 from
>> key=4, so it will have the same bucket index as key=20 but have different
>> map_locked index.
>>>> bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
>>>> }
>> Also need to pin the process on a specific CPU (e.g., CPU 0)
>>>> cleanup:
>>>> kprobe_example__destroy(skel);
>>>> return err;
>>>> }
>>>>
>>>> 4. run the bpf loader and perf record for nmi interrupts.  the warming occurs
>> For perf event, you can reference prog_tests/find_vma.c on how to using
>> perf_event_open to trigger a perf nmi interrupt. The perf event also needs to
>> pin on a specific CPU as the caller of bpf_map_update_elem() does.
>>
>>>> --
>>>> Best regards, Tonghao
>>> .
>
Waiman Long Nov. 29, 2022, 4:06 p.m. UTC | #14
On 11/29/22 07:45, Hou Tao wrote:
> Hi,
>
> On 11/29/2022 2:06 PM, Tonghao Zhang wrote:
>> On Tue, Nov 29, 2022 at 12:32 PM Hou Tao <houtao1@huawei.com> wrote:
>>> Hi,
>>>
>>> On 11/29/2022 5:55 AM, Hao Luo wrote:
>>>> On Sun, Nov 27, 2022 at 7:15 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>>>> Hi Tonghao,
>>>>
>>>> With a quick look at the htab_lock_bucket() and your problem
>>>> statement, I agree with Hou Tao that using hash &
>>>> min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) to index in map_locked seems
>>>> to fix the potential deadlock. Can you actually send your changes as
>>>> v2 so we can take a look and better help you? Also, can you explain
>>>> your solution in your commit message? Right now, your commit message
>>>> has only a problem statement and is not very clear. Please include
>>>> more details on what you do to fix the issue.
>>>>
>>>> Hao
>>> It would be better if the test case below can be rewritten as a bpf selftests.
>>> Please see comments below on how to improve it and reproduce the deadlock.
>>>>> Hi
>>>>> only a warning from lockdep.
>>> Thanks for your details instruction.  I can reproduce the warning by using your
>>> setup. I am not a lockdep expert, it seems that fixing such warning needs to set
>>> different lockdep class to the different bucket. Because we use map_locked to
>>> protect the acquisition of bucket lock, so I think we can define  lock_class_key
>>> array in bpf_htab (e.g., lockdep_key[HASHTAB_MAP_LOCK_COUNT]) and initialize the
>>> bucket lock accordingly.
> The proposed lockdep solution doesn't work. Still got lockdep warning after
> that, so cc +locking expert +lkml.org for lockdep help.
>
> Hi lockdep experts,
>
> We are trying to fix the following lockdep warning from bpf subsystem:
>
> [   36.092222] ================================
> [   36.092230] WARNING: inconsistent lock state
> [   36.092234] 6.1.0-rc5+ #81 Tainted: G            E
> [   36.092236] --------------------------------
> [   36.092237] inconsistent {INITIAL USE} -> {IN-NMI} usage.
> [   36.092238] perf/1515 [HC1[1]:SC0[0]:HE0:SE1] takes:
> [   36.092242] ffff888341acd1a0 (&htab->lockdep_key){....}-{2:2}, at:
> htab_lock_bucket+0x4d/0x58
> [   36.092253] {INITIAL USE} state was registered at:
> [   36.092255]   mark_usage+0x1d/0x11d
> [   36.092262]   __lock_acquire+0x3c9/0x6ed
> [   36.092266]   lock_acquire+0x23d/0x29a
> [   36.092270]   _raw_spin_lock_irqsave+0x43/0x7f
> [   36.092274]   htab_lock_bucket+0x4d/0x58
> [   36.092276]   htab_map_delete_elem+0x82/0xfb
> [   36.092278]   map_delete_elem+0x156/0x1ac
> [   36.092282]   __sys_bpf+0x138/0xb71
> [   36.092285]   __do_sys_bpf+0xd/0x15
> [   36.092288]   do_syscall_64+0x6d/0x84
> [   36.092291]   entry_SYSCALL_64_after_hwframe+0x63/0xcd
> [   36.092295] irq event stamp: 120346
> [   36.092296] hardirqs last  enabled at (120345): [<ffffffff8180b97f>]
> _raw_spin_unlock_irq+0x24/0x39
> [   36.092299] hardirqs last disabled at (120346): [<ffffffff81169e85>]
> generic_exec_single+0x40/0xb9
> [   36.092303] softirqs last  enabled at (120268): [<ffffffff81c00347>]
> __do_softirq+0x347/0x387
> [   36.092307] softirqs last disabled at (120133): [<ffffffff810ba4f0>]
> __irq_exit_rcu+0x67/0xc6
> [   36.092311]
> [   36.092311] other info that might help us debug this:
> [   36.092312]  Possible unsafe locking scenario:
> [   36.092312]
> [   36.092313]        CPU0
> [   36.092313]        ----
> [   36.092314]   lock(&htab->lockdep_key);
> [   36.092315]   <Interrupt>
> [   36.092316]     lock(&htab->lockdep_key);
> [   36.092318]
> [   36.092318]  *** DEADLOCK ***
> [   36.092318]
> [   36.092318] 3 locks held by perf/1515:
> [   36.092320]  #0: ffff8881b9805cc0 (&cpuctx_mutex){+.+.}-{4:4}, at:
> perf_event_ctx_lock_nested+0x8e/0xba
> [   36.092327]  #1: ffff8881075ecc20 (&event->child_mutex){+.+.}-{4:4}, at:
> perf_event_for_each_child+0x35/0x76
> [   36.092332]  #2: ffff8881b9805c20 (&cpuctx_lock){-.-.}-{2:2}, at:
> perf_ctx_lock+0x12/0x27
> [   36.092339]
> [   36.092339] stack backtrace:
> [   36.092341] CPU: 0 PID: 1515 Comm: perf Tainted: G            E
> 6.1.0-rc5+ #81
> [   36.092344] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
> [   36.092349] Call Trace:
> [   36.092351]  <NMI>
> [   36.092354]  dump_stack_lvl+0x57/0x81
> [   36.092359]  lock_acquire+0x1f4/0x29a
> [   36.092363]  ? handle_pmi_common+0x13f/0x1f0
> [   36.092366]  ? htab_lock_bucket+0x4d/0x58
> [   36.092371]  _raw_spin_lock_irqsave+0x43/0x7f
> [   36.092374]  ? htab_lock_bucket+0x4d/0x58
> [   36.092377]  htab_lock_bucket+0x4d/0x58
> [   36.092379]  htab_map_update_elem+0x11e/0x220
> [   36.092386]  bpf_prog_f3a535ca81a8128a_bpf_prog2+0x3e/0x42
> [   36.092392]  trace_call_bpf+0x177/0x215
> [   36.092398]  perf_trace_run_bpf_submit+0x52/0xaa
> [   36.092403]  ? x86_pmu_stop+0x97/0x97
> [   36.092407]  perf_trace_nmi_handler+0xb7/0xe0
> [   36.092415]  nmi_handle+0x116/0x254
> [   36.092418]  ? x86_pmu_stop+0x97/0x97
> [   36.092423]  default_do_nmi+0x3d/0xf6
> [   36.092428]  exc_nmi+0xa1/0x109
> [   36.092432]  end_repeat_nmi+0x16/0x67
> [   36.092436] RIP: 0010:wrmsrl+0xd/0x1b

So the lock is really taken in an NMI context. In general, we advise 
against using a lock in an NMI context unless it is a lock that is used only 
in that context. Otherwise, deadlock is certainly a possibility as there 
is no way to mask off NMIs.

Cheers,
Longman
Boqun Feng Nov. 29, 2022, 5:23 p.m. UTC | #15
On Tue, Nov 29, 2022 at 11:06:51AM -0500, Waiman Long wrote:
> On 11/29/22 07:45, Hou Tao wrote:
> > Hi,
> > 
> > On 11/29/2022 2:06 PM, Tonghao Zhang wrote:
> > > On Tue, Nov 29, 2022 at 12:32 PM Hou Tao <houtao1@huawei.com> wrote:
> > > > Hi,
> > > > 
> > > > On 11/29/2022 5:55 AM, Hao Luo wrote:
> > > > > On Sun, Nov 27, 2022 at 7:15 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > > > Hi Tonghao,
> > > > > 
> > > > > With a quick look at the htab_lock_bucket() and your problem
> > > > > statement, I agree with Hou Tao that using hash &
> > > > > min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) to index in map_locked seems
> > > > > to fix the potential deadlock. Can you actually send your changes as
> > > > > v2 so we can take a look and better help you? Also, can you explain
> > > > > your solution in your commit message? Right now, your commit message
> > > > > has only a problem statement and is not very clear. Please include
> > > > > more details on what you do to fix the issue.
> > > > > 
> > > > > Hao
> > > > It would be better if the test case below can be rewritten as a bpf selftests.
> > > > Please see comments below on how to improve it and reproduce the deadlock.
> > > > > > Hi
> > > > > > only a warning from lockdep.
> > > > Thanks for your details instruction.  I can reproduce the warning by using your
> > > > setup. I am not a lockdep expert, it seems that fixing such warning needs to set
> > > > different lockdep class to the different bucket. Because we use map_locked to
> > > > protect the acquisition of bucket lock, so I think we can define  lock_class_key
> > > > array in bpf_htab (e.g., lockdep_key[HASHTAB_MAP_LOCK_COUNT]) and initialize the
> > > > bucket lock accordingly.
> > The proposed lockdep solution doesn't work. Still got lockdep warning after
> > that, so cc +locking expert +lkml.org for lockdep help.
> > 
> > Hi lockdep experts,
> > 
> > We are trying to fix the following lockdep warning from bpf subsystem:
> > 
> > [   36.092222] ================================
> > [   36.092230] WARNING: inconsistent lock state
> > [   36.092234] 6.1.0-rc5+ #81 Tainted: G            E
> > [   36.092236] --------------------------------
> > [   36.092237] inconsistent {INITIAL USE} -> {IN-NMI} usage.
> > [   36.092238] perf/1515 [HC1[1]:SC0[0]:HE0:SE1] takes:
> > [   36.092242] ffff888341acd1a0 (&htab->lockdep_key){....}-{2:2}, at:
> > htab_lock_bucket+0x4d/0x58
> > [   36.092253] {INITIAL USE} state was registered at:
> > [   36.092255]   mark_usage+0x1d/0x11d
> > [   36.092262]   __lock_acquire+0x3c9/0x6ed
> > [   36.092266]   lock_acquire+0x23d/0x29a
> > [   36.092270]   _raw_spin_lock_irqsave+0x43/0x7f
> > [   36.092274]   htab_lock_bucket+0x4d/0x58
> > [   36.092276]   htab_map_delete_elem+0x82/0xfb
> > [   36.092278]   map_delete_elem+0x156/0x1ac
> > [   36.092282]   __sys_bpf+0x138/0xb71
> > [   36.092285]   __do_sys_bpf+0xd/0x15
> > [   36.092288]   do_syscall_64+0x6d/0x84
> > [   36.092291]   entry_SYSCALL_64_after_hwframe+0x63/0xcd
> > [   36.092295] irq event stamp: 120346
> > [   36.092296] hardirqs last  enabled at (120345): [<ffffffff8180b97f>]
> > _raw_spin_unlock_irq+0x24/0x39
> > [   36.092299] hardirqs last disabled at (120346): [<ffffffff81169e85>]
> > generic_exec_single+0x40/0xb9
> > [   36.092303] softirqs last  enabled at (120268): [<ffffffff81c00347>]
> > __do_softirq+0x347/0x387
> > [   36.092307] softirqs last disabled at (120133): [<ffffffff810ba4f0>]
> > __irq_exit_rcu+0x67/0xc6
> > [   36.092311]
> > [   36.092311] other info that might help us debug this:
> > [   36.092312]  Possible unsafe locking scenario:
> > [   36.092312]
> > [   36.092313]        CPU0
> > [   36.092313]        ----
> > [   36.092314]   lock(&htab->lockdep_key);
> > [   36.092315]   <Interrupt>
> > [   36.092316]     lock(&htab->lockdep_key);
> > [   36.092318]
> > [   36.092318]  *** DEADLOCK ***
> > [   36.092318]
> > [   36.092318] 3 locks held by perf/1515:
> > [   36.092320]  #0: ffff8881b9805cc0 (&cpuctx_mutex){+.+.}-{4:4}, at:
> > perf_event_ctx_lock_nested+0x8e/0xba
> > [   36.092327]  #1: ffff8881075ecc20 (&event->child_mutex){+.+.}-{4:4}, at:
> > perf_event_for_each_child+0x35/0x76
> > [   36.092332]  #2: ffff8881b9805c20 (&cpuctx_lock){-.-.}-{2:2}, at:
> > perf_ctx_lock+0x12/0x27
> > [   36.092339]
> > [   36.092339] stack backtrace:
> > [   36.092341] CPU: 0 PID: 1515 Comm: perf Tainted: G            E
> > 6.1.0-rc5+ #81
> > [   36.092344] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> > rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
> > [   36.092349] Call Trace:
> > [   36.092351]  <NMI>
> > [   36.092354]  dump_stack_lvl+0x57/0x81
> > [   36.092359]  lock_acquire+0x1f4/0x29a
> > [   36.092363]  ? handle_pmi_common+0x13f/0x1f0
> > [   36.092366]  ? htab_lock_bucket+0x4d/0x58
> > [   36.092371]  _raw_spin_lock_irqsave+0x43/0x7f
> > [   36.092374]  ? htab_lock_bucket+0x4d/0x58
> > [   36.092377]  htab_lock_bucket+0x4d/0x58
> > [   36.092379]  htab_map_update_elem+0x11e/0x220
> > [   36.092386]  bpf_prog_f3a535ca81a8128a_bpf_prog2+0x3e/0x42
> > [   36.092392]  trace_call_bpf+0x177/0x215
> > [   36.092398]  perf_trace_run_bpf_submit+0x52/0xaa
> > [   36.092403]  ? x86_pmu_stop+0x97/0x97
> > [   36.092407]  perf_trace_nmi_handler+0xb7/0xe0
> > [   36.092415]  nmi_handle+0x116/0x254
> > [   36.092418]  ? x86_pmu_stop+0x97/0x97
> > [   36.092423]  default_do_nmi+0x3d/0xf6
> > [   36.092428]  exc_nmi+0xa1/0x109
> > [   36.092432]  end_repeat_nmi+0x16/0x67
> > [   36.092436] RIP: 0010:wrmsrl+0xd/0x1b
> 
> So the lock is really taken in a NMI context. In general, we advise again
> using lock in a NMI context unless it is a lock that is used only in that
> context. Otherwise, deadlock is certainly a possibility as there is no way
> to mask off again NMI.
> 

I think here they use a percpu counter as an "outer lock" to make the
accesses to the real lock exclusive:

	preempt_disable();
	a = __this_cpu_inc(->map_locked);
	if (a != 1) {
		__this_cpu_dec(->map_locked);
		preempt_enable();
		return -EBUSY;
	}
	
	raw_spin_lock_irqsave(->raw_lock);

and lockdep is not aware that ->map_locked acts as a lock.

However, I feel this may be just a reinvented try_lock pattern, Hou Tao,
could you see if this can be refactored with a try_lock? Otherwise, you
may need to introduce a virtual lockclass for ->map_locked.

Regards,
Boqun

> Cheers,
> Longman
>
Boqun Feng Nov. 29, 2022, 5:32 p.m. UTC | #16
On Tue, Nov 29, 2022 at 09:23:18AM -0800, Boqun Feng wrote:
> On Tue, Nov 29, 2022 at 11:06:51AM -0500, Waiman Long wrote:
> > On 11/29/22 07:45, Hou Tao wrote:
> > > Hi,
> > > 
> > > On 11/29/2022 2:06 PM, Tonghao Zhang wrote:
> > > > On Tue, Nov 29, 2022 at 12:32 PM Hou Tao <houtao1@huawei.com> wrote:
> > > > > Hi,
> > > > > 
> > > > > On 11/29/2022 5:55 AM, Hao Luo wrote:
> > > > > > On Sun, Nov 27, 2022 at 7:15 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > > > > Hi Tonghao,
> > > > > > 
> > > > > > With a quick look at the htab_lock_bucket() and your problem
> > > > > > statement, I agree with Hou Tao that using hash &
> > > > > > min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) to index in map_locked seems
> > > > > > to fix the potential deadlock. Can you actually send your changes as
> > > > > > v2 so we can take a look and better help you? Also, can you explain
> > > > > > your solution in your commit message? Right now, your commit message
> > > > > > has only a problem statement and is not very clear. Please include
> > > > > > more details on what you do to fix the issue.
> > > > > > 
> > > > > > Hao
> > > > > It would be better if the test case below can be rewritten as a bpf selftests.
> > > > > Please see comments below on how to improve it and reproduce the deadlock.
> > > > > > > Hi
> > > > > > > only a warning from lockdep.
> > > > > Thanks for your details instruction.  I can reproduce the warning by using your
> > > > > setup. I am not a lockdep expert, it seems that fixing such warning needs to set
> > > > > different lockdep class to the different bucket. Because we use map_locked to
> > > > > protect the acquisition of bucket lock, so I think we can define  lock_class_key
> > > > > array in bpf_htab (e.g., lockdep_key[HASHTAB_MAP_LOCK_COUNT]) and initialize the
> > > > > bucket lock accordingly.
> > > The proposed lockdep solution doesn't work. Still got lockdep warning after
> > > that, so cc +locking expert +lkml.org for lockdep help.
> > > 
> > > Hi lockdep experts,
> > > 
> > > We are trying to fix the following lockdep warning from bpf subsystem:
> > > 
> > > [   36.092222] ================================
> > > [   36.092230] WARNING: inconsistent lock state
> > > [   36.092234] 6.1.0-rc5+ #81 Tainted: G            E
> > > [   36.092236] --------------------------------
> > > [   36.092237] inconsistent {INITIAL USE} -> {IN-NMI} usage.
> > > [   36.092238] perf/1515 [HC1[1]:SC0[0]:HE0:SE1] takes:
> > > [   36.092242] ffff888341acd1a0 (&htab->lockdep_key){....}-{2:2}, at:
> > > htab_lock_bucket+0x4d/0x58
> > > [   36.092253] {INITIAL USE} state was registered at:
> > > [   36.092255]   mark_usage+0x1d/0x11d
> > > [   36.092262]   __lock_acquire+0x3c9/0x6ed
> > > [   36.092266]   lock_acquire+0x23d/0x29a
> > > [   36.092270]   _raw_spin_lock_irqsave+0x43/0x7f
> > > [   36.092274]   htab_lock_bucket+0x4d/0x58
> > > [   36.092276]   htab_map_delete_elem+0x82/0xfb
> > > [   36.092278]   map_delete_elem+0x156/0x1ac
> > > [   36.092282]   __sys_bpf+0x138/0xb71
> > > [   36.092285]   __do_sys_bpf+0xd/0x15
> > > [   36.092288]   do_syscall_64+0x6d/0x84
> > > [   36.092291]   entry_SYSCALL_64_after_hwframe+0x63/0xcd
> > > [   36.092295] irq event stamp: 120346
> > > [   36.092296] hardirqs last  enabled at (120345): [<ffffffff8180b97f>]
> > > _raw_spin_unlock_irq+0x24/0x39
> > > [   36.092299] hardirqs last disabled at (120346): [<ffffffff81169e85>]
> > > generic_exec_single+0x40/0xb9
> > > [   36.092303] softirqs last  enabled at (120268): [<ffffffff81c00347>]
> > > __do_softirq+0x347/0x387
> > > [   36.092307] softirqs last disabled at (120133): [<ffffffff810ba4f0>]
> > > __irq_exit_rcu+0x67/0xc6
> > > [   36.092311]
> > > [   36.092311] other info that might help us debug this:
> > > [   36.092312]  Possible unsafe locking scenario:
> > > [   36.092312]
> > > [   36.092313]        CPU0
> > > [   36.092313]        ----
> > > [   36.092314]   lock(&htab->lockdep_key);
> > > [   36.092315]   <Interrupt>
> > > [   36.092316]     lock(&htab->lockdep_key);
> > > [   36.092318]
> > > [   36.092318]  *** DEADLOCK ***
> > > [   36.092318]
> > > [   36.092318] 3 locks held by perf/1515:
> > > [   36.092320]  #0: ffff8881b9805cc0 (&cpuctx_mutex){+.+.}-{4:4}, at:
> > > perf_event_ctx_lock_nested+0x8e/0xba
> > > [   36.092327]  #1: ffff8881075ecc20 (&event->child_mutex){+.+.}-{4:4}, at:
> > > perf_event_for_each_child+0x35/0x76
> > > [   36.092332]  #2: ffff8881b9805c20 (&cpuctx_lock){-.-.}-{2:2}, at:
> > > perf_ctx_lock+0x12/0x27
> > > [   36.092339]
> > > [   36.092339] stack backtrace:
> > > [   36.092341] CPU: 0 PID: 1515 Comm: perf Tainted: G            E
> > > 6.1.0-rc5+ #81
> > > [   36.092344] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> > > rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
> > > [   36.092349] Call Trace:
> > > [   36.092351]  <NMI>
> > > [   36.092354]  dump_stack_lvl+0x57/0x81
> > > [   36.092359]  lock_acquire+0x1f4/0x29a
> > > [   36.092363]  ? handle_pmi_common+0x13f/0x1f0
> > > [   36.092366]  ? htab_lock_bucket+0x4d/0x58
> > > [   36.092371]  _raw_spin_lock_irqsave+0x43/0x7f
> > > [   36.092374]  ? htab_lock_bucket+0x4d/0x58
> > > [   36.092377]  htab_lock_bucket+0x4d/0x58
> > > [   36.092379]  htab_map_update_elem+0x11e/0x220
> > > [   36.092386]  bpf_prog_f3a535ca81a8128a_bpf_prog2+0x3e/0x42
> > > [   36.092392]  trace_call_bpf+0x177/0x215
> > > [   36.092398]  perf_trace_run_bpf_submit+0x52/0xaa
> > > [   36.092403]  ? x86_pmu_stop+0x97/0x97
> > > [   36.092407]  perf_trace_nmi_handler+0xb7/0xe0
> > > [   36.092415]  nmi_handle+0x116/0x254
> > > [   36.092418]  ? x86_pmu_stop+0x97/0x97
> > > [   36.092423]  default_do_nmi+0x3d/0xf6
> > > [   36.092428]  exc_nmi+0xa1/0x109
> > > [   36.092432]  end_repeat_nmi+0x16/0x67
> > > [   36.092436] RIP: 0010:wrmsrl+0xd/0x1b
> > 
> > So the lock is really taken in a NMI context. In general, we advise again
> > using lock in a NMI context unless it is a lock that is used only in that
> > context. Otherwise, deadlock is certainly a possibility as there is no way
> > to mask off again NMI.
> > 
> 
> I think here they use a percpu counter as an "outer lock" to make the
> accesses to the real lock exclusive:
> 
> 	preempt_disable();
> 	a = __this_cpu_inc(->map_locked);
> 	if (a != 1) {
> 		__this_cpu_dec(->map_locked);
> 		preempt_enable();
> 		return -EBUSY;
> 	}
> 	preempt_enable();
> 		return -EBUSY;
> 	
> 	raw_spin_lock_irqsave(->raw_lock);
> 
> and lockdep is not aware that ->map_locked acts as a lock.
> 
> However, I feel this may be just a reinvented try_lock pattern, Hou Tao,
> could you see if this can be refactored with a try_lock? Otherwise, you

Just to be clear, I meant to refactor htab_lock_bucket() into a try
lock pattern. Also after a second thought, the below suggestion doesn't
work. I think the proper way is to make htab_lock_bucket() as a
raw_spin_trylock_irqsave().
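(For concreteness, a minimal sketch of that try-lock shape, reusing the existing struct bucket; this is only an illustration of the idea, not a tested patch:)

static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags)
{
	unsigned long flags;

	/* give up immediately instead of spinning, so a same-CPU
	 * re-entry (e.g. from NMI) cannot deadlock on raw_lock
	 */
	if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
		return -EBUSY;

	*pflags = flags;
	return 0;
}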

Regards,
Boqun

> may need to introduce a virtual lockclass for ->map_locked.
> 
> Regards,
> Boqun
> 
> > Cheers,
> > Longman
> >
Hao Luo Nov. 29, 2022, 7:36 p.m. UTC | #17
On Tue, Nov 29, 2022 at 9:32 AM Boqun Feng <boqun.feng@gmail.com> wrote:
>
> Just to be clear, I meant to refactor htab_lock_bucket() into a try
> lock pattern. Also after a second thought, the below suggestion doesn't
> work. I think the proper way is to make htab_lock_bucket() as a
> raw_spin_trylock_irqsave().
>
> Regards,
> Boqun
>

The potential deadlock happens when the lock is contended from the
same cpu. When the lock is contended from a remote cpu, we would like
the remote cpu to spin and wait instead of giving up immediately, as
this gives better throughput. So replacing the current
raw_spin_lock_irqsave() with trylock sacrifices this performance gain.

I suspect the source of the problem is the 'hash' that we used in
htab_lock_bucket(). The 'hash' is derived from the 'key', I wonder
whether we should use a hash derived from 'bucket' rather than from
'key'. For example, from the memory address of the 'bucket'. Because,
different keys may fall into the same bucket, but yield different
hashes. If the same bucket can never have two different 'hashes' here,
the map_locked check should behave as intended. Also because
->map_locked is per-cpu, execution flows from two different cpus can
both pass.
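
(A rough sketch of what indexing map_locked by bucket instead of by key hash could look like; the helper name htab_bucket_lock_idx() is made up for illustration only:)

static inline u32 htab_bucket_lock_idx(const struct bpf_htab *htab,
				       const struct bucket *b)
{
	/* buckets form one contiguous array, so two keys that fall into
	 * the same bucket always yield the same map_locked index here,
	 * even when their key hashes differ
	 */
	u32 bucket_idx = b - htab->buckets;

	return bucket_idx & HASHTAB_MAP_LOCK_MASK;
}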

Hao
Waiman Long Nov. 29, 2022, 9:13 p.m. UTC | #18
On 11/29/22 14:36, Hao Luo wrote:
> On Tue, Nov 29, 2022 at 9:32 AM Boqun Feng <boqun.feng@gmail.com> wrote:
>> Just to be clear, I meant to refactor htab_lock_bucket() into a try
>> lock pattern. Also after a second thought, the below suggestion doesn't
>> work. I think the proper way is to make htab_lock_bucket() as a
>> raw_spin_trylock_irqsave().
>>
>> Regards,
>> Boqun
>>
> The potential deadlock happens when the lock is contended from the
> same cpu. When the lock is contended from a remote cpu, we would like
> the remote cpu to spin and wait, instead of giving up immediately. As
> this gives better throughput. So replacing the current
> raw_spin_lock_irqsave() with trylock sacrifices this performance gain.
>
> I suspect the source of the problem is the 'hash' that we used in
> htab_lock_bucket(). The 'hash' is derived from the 'key', I wonder
> whether we should use a hash derived from 'bucket' rather than from
> 'key'. For example, from the memory address of the 'bucket'. Because,
> different keys may fall into the same bucket, but yield different
> hashes. If the same bucket can never have two different 'hashes' here,
> the map_locked check should behave as intended. Also because
> ->map_locked is per-cpu, execution flows from two different cpus can
> both pass.

I would suggest that you add an in_nmi() check and, if true, use trylock to 
get the lock. You can continue to use raw_spin_lock_irqsave() in all 
other cases.

Cheers,
Longman
Hou Tao Nov. 30, 2022, 1:37 a.m. UTC | #19
Hi,

On 11/30/2022 1:23 AM, Boqun Feng wrote:
> On Tue, Nov 29, 2022 at 11:06:51AM -0500, Waiman Long wrote:
>> On 11/29/22 07:45, Hou Tao wrote:
>>> Hi,
>>>
>>> On 11/29/2022 2:06 PM, Tonghao Zhang wrote:
>>>> On Tue, Nov 29, 2022 at 12:32 PM Hou Tao <houtao1@huawei.com> wrote:
>>>>> Hi,
>>>>>
>>>>> On 11/29/2022 5:55 AM, Hao Luo wrote:
>>>>>> On Sun, Nov 27, 2022 at 7:15 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>>>>>> Hi Tonghao,
>>>>>>
>>>>>> With a quick look at the htab_lock_bucket() and your problem
>>>>>> statement, I agree with Hou Tao that using hash &
>>>>>> min(HASHTAB_MAP_LOCK_MASK, n_bucket - 1) to index in map_locked seems
>>>>>> to fix the potential deadlock. Can you actually send your changes as
>>>>>> v2 so we can take a look and better help you? Also, can you explain
>>>>>> your solution in your commit message? Right now, your commit message
>>>>>> has only a problem statement and is not very clear. Please include
>>>>>> more details on what you do to fix the issue.
>>>>>>
>>>>>> Hao
>>>>> It would be better if the test case below can be rewritten as a bpf selftests.
>>>>> Please see comments below on how to improve it and reproduce the deadlock.
>>>>>>> Hi
>>>>>>> only a warning from lockdep.
>>>>> Thanks for your details instruction.  I can reproduce the warning by using your
>>>>> setup. I am not a lockdep expert, it seems that fixing such warning needs to set
>>>>> different lockdep class to the different bucket. Because we use map_locked to
>>>>> protect the acquisition of bucket lock, so I think we can define  lock_class_key
>>>>> array in bpf_htab (e.g., lockdep_key[HASHTAB_MAP_LOCK_COUNT]) and initialize the
>>>>> bucket lock accordingly.
>>> The proposed lockdep solution doesn't work. Still got lockdep warning after
>>> that, so cc +locking expert +lkml.org for lockdep help.
>>>
>>> Hi lockdep experts,
>>>
>>> We are trying to fix the following lockdep warning from bpf subsystem:
>>>
>>> [   36.092222] ================================
>>> [   36.092230] WARNING: inconsistent lock state
>>> [   36.092234] 6.1.0-rc5+ #81 Tainted: G            E
>>> [   36.092236] --------------------------------
>>> [   36.092237] inconsistent {INITIAL USE} -> {IN-NMI} usage.
>>> [   36.092238] perf/1515 [HC1[1]:SC0[0]:HE0:SE1] takes:
>>> [   36.092242] ffff888341acd1a0 (&htab->lockdep_key){....}-{2:2}, at:
>>> htab_lock_bucket+0x4d/0x58
>>> [   36.092253] {INITIAL USE} state was registered at:
>>> [   36.092255]   mark_usage+0x1d/0x11d
>>> [   36.092262]   __lock_acquire+0x3c9/0x6ed
>>> [   36.092266]   lock_acquire+0x23d/0x29a
>>> [   36.092270]   _raw_spin_lock_irqsave+0x43/0x7f
>>> [   36.092274]   htab_lock_bucket+0x4d/0x58
>>> [   36.092276]   htab_map_delete_elem+0x82/0xfb
>>> [   36.092278]   map_delete_elem+0x156/0x1ac
>>> [   36.092282]   __sys_bpf+0x138/0xb71
>>> [   36.092285]   __do_sys_bpf+0xd/0x15
>>> [   36.092288]   do_syscall_64+0x6d/0x84
>>> [   36.092291]   entry_SYSCALL_64_after_hwframe+0x63/0xcd
>>> [   36.092295] irq event stamp: 120346
>>> [   36.092296] hardirqs last  enabled at (120345): [<ffffffff8180b97f>]
>>> _raw_spin_unlock_irq+0x24/0x39
>>> [   36.092299] hardirqs last disabled at (120346): [<ffffffff81169e85>]
>>> generic_exec_single+0x40/0xb9
>>> [   36.092303] softirqs last  enabled at (120268): [<ffffffff81c00347>]
>>> __do_softirq+0x347/0x387
>>> [   36.092307] softirqs last disabled at (120133): [<ffffffff810ba4f0>]
>>> __irq_exit_rcu+0x67/0xc6
>>> [   36.092311]
>>> [   36.092311] other info that might help us debug this:
>>> [   36.092312]  Possible unsafe locking scenario:
>>> [   36.092312]
>>> [   36.092313]        CPU0
>>> [   36.092313]        ----
>>> [   36.092314]   lock(&htab->lockdep_key);
>>> [   36.092315]   <Interrupt>
>>> [   36.092316]     lock(&htab->lockdep_key);
>>> [   36.092318]
>>> [   36.092318]  *** DEADLOCK ***
>>> [   36.092318]
>>> [   36.092318] 3 locks held by perf/1515:
>>> [   36.092320]  #0: ffff8881b9805cc0 (&cpuctx_mutex){+.+.}-{4:4}, at:
>>> perf_event_ctx_lock_nested+0x8e/0xba
>>> [   36.092327]  #1: ffff8881075ecc20 (&event->child_mutex){+.+.}-{4:4}, at:
>>> perf_event_for_each_child+0x35/0x76
>>> [   36.092332]  #2: ffff8881b9805c20 (&cpuctx_lock){-.-.}-{2:2}, at:
>>> perf_ctx_lock+0x12/0x27
>>> [   36.092339]
>>> [   36.092339] stack backtrace:
>>> [   36.092341] CPU: 0 PID: 1515 Comm: perf Tainted: G            E
>>> 6.1.0-rc5+ #81
>>> [   36.092344] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
>>> rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
>>> [   36.092349] Call Trace:
>>> [   36.092351]  <NMI>
>>> [   36.092354]  dump_stack_lvl+0x57/0x81
>>> [   36.092359]  lock_acquire+0x1f4/0x29a
>>> [   36.092363]  ? handle_pmi_common+0x13f/0x1f0
>>> [   36.092366]  ? htab_lock_bucket+0x4d/0x58
>>> [   36.092371]  _raw_spin_lock_irqsave+0x43/0x7f
>>> [   36.092374]  ? htab_lock_bucket+0x4d/0x58
>>> [   36.092377]  htab_lock_bucket+0x4d/0x58
>>> [   36.092379]  htab_map_update_elem+0x11e/0x220
>>> [   36.092386]  bpf_prog_f3a535ca81a8128a_bpf_prog2+0x3e/0x42
>>> [   36.092392]  trace_call_bpf+0x177/0x215
>>> [   36.092398]  perf_trace_run_bpf_submit+0x52/0xaa
>>> [   36.092403]  ? x86_pmu_stop+0x97/0x97
>>> [   36.092407]  perf_trace_nmi_handler+0xb7/0xe0
>>> [   36.092415]  nmi_handle+0x116/0x254
>>> [   36.092418]  ? x86_pmu_stop+0x97/0x97
>>> [   36.092423]  default_do_nmi+0x3d/0xf6
>>> [   36.092428]  exc_nmi+0xa1/0x109
>>> [   36.092432]  end_repeat_nmi+0x16/0x67
>>> [   36.092436] RIP: 0010:wrmsrl+0xd/0x1b
>> So the lock is really taken in a NMI context. In general, we advise again
>> using lock in a NMI context unless it is a lock that is used only in that
>> context. Otherwise, deadlock is certainly a possibility as there is no way
>> to mask off again NMI.
>>
> I think here they use a percpu counter as an "outer lock" to make the
> accesses to the real lock exclusive:
>
> 	preempt_disable();
> 	a = __this_cpu_inc(->map_locked);
> 	if (a != 1) {
> 		__this_cpu_dec(->map_locked);
> 		preempt_enable();
> 		return -EBUSY;
> 	}
> 	preempt_enable();
> 		return -EBUSY;
> 	
> 	raw_spin_lock_irqsave(->raw_lock);
>
> and lockdep is not aware that ->map_locked acts as a lock.
>
> However, I feel this may be just a reinvented try_lock pattern, Hou Tao,
> could you see if this can be refactored with a try_lock? Otherwise, you
> may need to introduce a virtual lockclass for ->map_locked.
As said by Hao Luo, the problem with using trylock in NMI context is that it
cannot distinguish between a deadlock and a lock under high contention. And map_locked
is still needed even if trylock is used in NMI, because htab_map_update_elem() may
be reentered in a normal context through attaching a bpf program to a function
called after the lock has been taken. So introducing a virtual lockclass for ->map_locked
is a better idea.

Thanks,
Tao
> Regards,
> Boqun
>
>> Cheers,
>> Longman
>>
> .
Hou Tao Nov. 30, 2022, 1:50 a.m. UTC | #20
Hi Hao,

On 11/30/2022 3:36 AM, Hao Luo wrote:
> On Tue, Nov 29, 2022 at 9:32 AM Boqun Feng <boqun.feng@gmail.com> wrote:
>> Just to be clear, I meant to refactor htab_lock_bucket() into a try
>> lock pattern. Also after a second thought, the below suggestion doesn't
>> work. I think the proper way is to make htab_lock_bucket() as a
>> raw_spin_trylock_irqsave().
>>
>> Regards,
>> Boqun
>>
> The potential deadlock happens when the lock is contended from the
> same cpu. When the lock is contended from a remote cpu, we would like
> the remote cpu to spin and wait, instead of giving up immediately. As
> this gives better throughput. So replacing the current
> raw_spin_lock_irqsave() with trylock sacrifices this performance gain.
>
> I suspect the source of the problem is the 'hash' that we used in
> htab_lock_bucket(). The 'hash' is derived from the 'key', I wonder
> whether we should use a hash derived from 'bucket' rather than from
> 'key'. For example, from the memory address of the 'bucket'. Because,
> different keys may fall into the same bucket, but yield different
> hashes. If the same bucket can never have two different 'hashes' here,
> the map_locked check should behave as intended. Also because
> ->map_locked is per-cpu, execution flows from two different cpus can
> both pass.
The warning from lockdep is due to the fact that bucket lock A is first used in a
non-NMI context and then the same bucket lock is used in an NMI context, so
lockdep deduces that this may be a deadlock. I have already tried to use the same
map_locked for keys with the same bucket; the deadlock is gone, but I still got the
lockdep warning.
>
> Hao
> .
Tonghao Zhang Nov. 30, 2022, 2:47 a.m. UTC | #21
On Wed, Nov 30, 2022 at 9:50 AM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi Hao,
>
> On 11/30/2022 3:36 AM, Hao Luo wrote:
> > On Tue, Nov 29, 2022 at 9:32 AM Boqun Feng <boqun.feng@gmail.com> wrote:
> >> Just to be clear, I meant to refactor htab_lock_bucket() into a try
> >> lock pattern. Also after a second thought, the below suggestion doesn't
> >> work. I think the proper way is to make htab_lock_bucket() as a
> >> raw_spin_trylock_irqsave().
> >>
> >> Regards,
> >> Boqun
> >>
> > The potential deadlock happens when the lock is contended from the
> > same cpu. When the lock is contended from a remote cpu, we would like
> > the remote cpu to spin and wait, instead of giving up immediately. As
> > this gives better throughput. So replacing the current
> > raw_spin_lock_irqsave() with trylock sacrifices this performance gain.
> >
> > I suspect the source of the problem is the 'hash' that we used in
> > htab_lock_bucket(). The 'hash' is derived from the 'key', I wonder
> > whether we should use a hash derived from 'bucket' rather than from
> > 'key'. For example, from the memory address of the 'bucket'. Because,
> > different keys may fall into the same bucket, but yield different
> > hashes. If the same bucket can never have two different 'hashes' here,
> > the map_locked check should behave as intended. Also because
> > ->map_locked is per-cpu, execution flows from two different cpus can
> > both pass.
> The warning from lockdep is due to the reason the bucket lock A is used in a
> no-NMI context firstly, then the same bucke lock is used a NMI context, so
Yes, I tested lockdep too; we can't use the lock in NMI context (only
try_lock works fine) if we also use it in non-NMI context, otherwise
lockdep prints the warning.
* for the deadlock case: we can use
1. hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1)
2. or the hash bucket address.

* for the lockdep warning, we should use an in_nmi check with map_locked.

BTW, that patch doesn't work, so we can remove the lockdep_key
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c50eb518e262fa06bd334e6eec172eaf5d7a5bd9

static inline int htab_lock_bucket(const struct bpf_htab *htab,
                                   struct bucket *b, u32 hash,
                                   unsigned long *pflags)
{
        unsigned long flags;

        hash = hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1);

        preempt_disable();
        if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
                __this_cpu_dec(*(htab->map_locked[hash]));
                preempt_enable();
                return -EBUSY;
        }

        if (in_nmi()) {
                if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
                        return -EBUSY;
        } else {
                raw_spin_lock_irqsave(&b->raw_lock, flags);
        }

        *pflags = flags;
        return 0;
}


> lockdep deduces that may be a dead-lock. I have already tried to use the same
> map_locked for keys with the same bucket, the dead-lock is gone, but still got
> lockdep warning.
> >
> > Hao
> > .
>
Waiman Long Nov. 30, 2022, 3:06 a.m. UTC | #22
On 11/29/22 21:47, Tonghao Zhang wrote:
> On Wed, Nov 30, 2022 at 9:50 AM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi Hao,
>>
>> On 11/30/2022 3:36 AM, Hao Luo wrote:
>>> On Tue, Nov 29, 2022 at 9:32 AM Boqun Feng <boqun.feng@gmail.com> wrote:
>>>> Just to be clear, I meant to refactor htab_lock_bucket() into a try
>>>> lock pattern. Also after a second thought, the below suggestion doesn't
>>>> work. I think the proper way is to make htab_lock_bucket() as a
>>>> raw_spin_trylock_irqsave().
>>>>
>>>> Regards,
>>>> Boqun
>>>>
>>> The potential deadlock happens when the lock is contended from the
>>> same cpu. When the lock is contended from a remote cpu, we would like
>>> the remote cpu to spin and wait, instead of giving up immediately. As
>>> this gives better throughput. So replacing the current
>>> raw_spin_lock_irqsave() with trylock sacrifices this performance gain.
>>>
>>> I suspect the source of the problem is the 'hash' that we used in
>>> htab_lock_bucket(). The 'hash' is derived from the 'key', I wonder
>>> whether we should use a hash derived from 'bucket' rather than from
>>> 'key'. For example, from the memory address of the 'bucket'. Because,
>>> different keys may fall into the same bucket, but yield different
>>> hashes. If the same bucket can never have two different 'hashes' here,
>>> the map_locked check should behave as intended. Also because
>>> ->map_locked is per-cpu, execution flows from two different cpus can
>>> both pass.
>> The warning from lockdep is due to the reason the bucket lock A is used in a
>> no-NMI context firstly, then the same bucke lock is used a NMI context, so
> Yes, I tested lockdep too, we can't use the lock in NMI(but only
> try_lock work fine) context if we use them no-NMI context. otherwise
> the lockdep prints the warning.
> * for the dead-lock case: we can use the
> 1. hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1)
> 2. or hash bucket address.
>
> * for lockdep warning, we should use in_nmi check with map_locked.
>
> BTW, the patch doesn't work, so we can remove the lock_key
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c50eb518e262fa06bd334e6eec172eaf5d7a5bd9
>
> static inline int htab_lock_bucket(const struct bpf_htab *htab,
>                                     struct bucket *b, u32 hash,
>                                     unsigned long *pflags)
> {
>          unsigned long flags;
>
>          hash = hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1);
>
>          preempt_disable();
>          if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
>                  __this_cpu_dec(*(htab->map_locked[hash]));
>                  preempt_enable();
>                  return -EBUSY;
>          }
>
>          if (in_nmi()) {
>                  if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
>                          return -EBUSY;
That is not right. You have to do the same steps as above by decrementing 
the percpu count and enabling preemption. So you may want to put all these 
busy_out steps after the return 0 and use "goto busy_out;" to jump there; a sketch of that flow follows below.
>          } else {
>                  raw_spin_lock_irqsave(&b->raw_lock, flags);
>          }
>
>          *pflags = flags;
>          return 0;
> }
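
(A sketch of the busy_out flow described above, keeping the field names from the snippet; illustrative only, not a tested patch:)

static inline int htab_lock_bucket(const struct bpf_htab *htab,
				   struct bucket *b, u32 hash,
				   unsigned long *pflags)
{
	unsigned long flags;

	hash = hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);

	preempt_disable();
	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1))
		goto busy_out;

	if (in_nmi()) {
		if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
			goto busy_out;
	} else {
		raw_spin_lock_irqsave(&b->raw_lock, flags);
	}

	*pflags = flags;
	return 0;

busy_out:
	/* undo the percpu count and preempt_disable() on both
	 * failure paths before returning -EBUSY
	 */
	__this_cpu_dec(*(htab->map_locked[hash]));
	preempt_enable();
	return -EBUSY;
}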

BTW, with that change, I believe you can actually remove all the percpu 
map_locked count code.

Cheers,
Longman
Tonghao Zhang Nov. 30, 2022, 3:32 a.m. UTC | #23
On Wed, Nov 30, 2022 at 11:07 AM Waiman Long <longman@redhat.com> wrote:
>
> On 11/29/22 21:47, Tonghao Zhang wrote:
> > On Wed, Nov 30, 2022 at 9:50 AM Hou Tao <houtao@huaweicloud.com> wrote:
> >> Hi Hao,
> >>
> >> On 11/30/2022 3:36 AM, Hao Luo wrote:
> >>> On Tue, Nov 29, 2022 at 9:32 AM Boqun Feng <boqun.feng@gmail.com> wrote:
> >>>> Just to be clear, I meant to refactor htab_lock_bucket() into a try
> >>>> lock pattern. Also after a second thought, the below suggestion doesn't
> >>>> work. I think the proper way is to make htab_lock_bucket() as a
> >>>> raw_spin_trylock_irqsave().
> >>>>
> >>>> Regards,
> >>>> Boqun
> >>>>
> >>> The potential deadlock happens when the lock is contended from the
> >>> same cpu. When the lock is contended from a remote cpu, we would like
> >>> the remote cpu to spin and wait, instead of giving up immediately. As
> >>> this gives better throughput. So replacing the current
> >>> raw_spin_lock_irqsave() with trylock sacrifices this performance gain.
> >>>
> >>> I suspect the source of the problem is the 'hash' that we used in
> >>> htab_lock_bucket(). The 'hash' is derived from the 'key', I wonder
> >>> whether we should use a hash derived from 'bucket' rather than from
> >>> 'key'. For example, from the memory address of the 'bucket'. Because,
> >>> different keys may fall into the same bucket, but yield different
> >>> hashes. If the same bucket can never have two different 'hashes' here,
> >>> the map_locked check should behave as intended. Also because
> >>> ->map_locked is per-cpu, execution flows from two different cpus can
> >>> both pass.
> >> The warning from lockdep is due to the reason the bucket lock A is used in a
> >> no-NMI context firstly, then the same bucke lock is used a NMI context, so
> > Yes, I tested lockdep too, we can't use the lock in NMI(but only
> > try_lock work fine) context if we use them no-NMI context. otherwise
> > the lockdep prints the warning.
> > * for the dead-lock case: we can use the
> > 1. hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1)
> > 2. or hash bucket address.
> >
> > * for lockdep warning, we should use in_nmi check with map_locked.
> >
> > BTW, the patch doesn't work, so we can remove the lock_key
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c50eb518e262fa06bd334e6eec172eaf5d7a5bd9
> >
> > static inline int htab_lock_bucket(const struct bpf_htab *htab,
> >                                     struct bucket *b, u32 hash,
> >                                     unsigned long *pflags)
> > {
> >          unsigned long flags;
> >
> >          hash = hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1);
> >
> >          preempt_disable();
> >          if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
> >                  __this_cpu_dec(*(htab->map_locked[hash]));
> >                  preempt_enable();
> >                  return -EBUSY;
> >          }
> >
> >          if (in_nmi()) {
> >                  if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
> >                          return -EBUSY;
> That is not right. You have to do the same step as above by decrementing
> the percpu count and enable preemption. So you may want to put all these
> busy_out steps after the return 0 and use "goto busy_out;" to jump there.
Yes, thanks Waiman, I should add the busy_out label.
> >          } else {
> >                  raw_spin_lock_irqsave(&b->raw_lock, flags);
> >          }
> >
> >          *pflags = flags;
> >          return 0;
> > }
>
> BTW, with that change, I believe you can actually remove all the percpu
> map_locked count code.
there are some cases, for example, when we run bpf_prog A and B in task
context on the same cpu:
bpf_prog A
update map X
    htab_lock_bucket
        raw_spin_lock_irqsave()
    lookup_elem_raw()
        // bpf prog B is attached on lookup_elem_raw()
        bpf prog B
            update map X again and update the element
                htab_lock_bucket()
                    // dead-lock
                    raw_spinlock_irqsave()
> Cheers,
> Longman
>
Waiman Long Nov. 30, 2022, 4:07 a.m. UTC | #24
On 11/29/22 22:32, Tonghao Zhang wrote:
> On Wed, Nov 30, 2022 at 11:07 AM Waiman Long <longman@redhat.com> wrote:
>> On 11/29/22 21:47, Tonghao Zhang wrote:
>>> On Wed, Nov 30, 2022 at 9:50 AM Hou Tao <houtao@huaweicloud.com> wrote:
>>>> Hi Hao,
>>>>
>>>> On 11/30/2022 3:36 AM, Hao Luo wrote:
>>>>> On Tue, Nov 29, 2022 at 9:32 AM Boqun Feng <boqun.feng@gmail.com> wrote:
>>>>>> Just to be clear, I meant to refactor htab_lock_bucket() into a try
>>>>>> lock pattern. Also after a second thought, the below suggestion doesn't
>>>>>> work. I think the proper way is to make htab_lock_bucket() as a
>>>>>> raw_spin_trylock_irqsave().
>>>>>>
>>>>>> Regards,
>>>>>> Boqun
>>>>>>
>>>>> The potential deadlock happens when the lock is contended from the
>>>>> same cpu. When the lock is contended from a remote cpu, we would like
>>>>> the remote cpu to spin and wait, instead of giving up immediately. As
>>>>> this gives better throughput. So replacing the current
>>>>> raw_spin_lock_irqsave() with trylock sacrifices this performance gain.
>>>>>
>>>>> I suspect the source of the problem is the 'hash' that we used in
>>>>> htab_lock_bucket(). The 'hash' is derived from the 'key', I wonder
>>>>> whether we should use a hash derived from 'bucket' rather than from
>>>>> 'key'. For example, from the memory address of the 'bucket'. Because,
>>>>> different keys may fall into the same bucket, but yield different
>>>>> hashes. If the same bucket can never have two different 'hashes' here,
>>>>> the map_locked check should behave as intended. Also because
>>>>> ->map_locked is per-cpu, execution flows from two different cpus can
>>>>> both pass.
>>>> The warning from lockdep is due to the reason the bucket lock A is used in a
>>>> no-NMI context firstly, then the same bucke lock is used a NMI context, so
>>> Yes, I tested lockdep too, we can't use the lock in NMI(but only
>>> try_lock work fine) context if we use them no-NMI context. otherwise
>>> the lockdep prints the warning.
>>> * for the dead-lock case: we can use the
>>> 1. hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1)
>>> 2. or hash bucket address.
>>>
>>> * for lockdep warning, we should use in_nmi check with map_locked.
>>>
>>> BTW, the patch doesn't work, so we can remove the lock_key
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c50eb518e262fa06bd334e6eec172eaf5d7a5bd9
>>>
>>> static inline int htab_lock_bucket(const struct bpf_htab *htab,
>>>                                      struct bucket *b, u32 hash,
>>>                                      unsigned long *pflags)
>>> {
>>>           unsigned long flags;
>>>
>>>           hash = hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1);
>>>
>>>           preempt_disable();
>>>           if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
>>>                   __this_cpu_dec(*(htab->map_locked[hash]));
>>>                   preempt_enable();
>>>                   return -EBUSY;
>>>           }
>>>
>>>           if (in_nmi()) {
>>>                   if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
>>>                           return -EBUSY;
>> That is not right. You have to do the same step as above by decrementing
>> the percpu count and enable preemption. So you may want to put all these
>> busy_out steps after the return 0 and use "goto busy_out;" to jump there.
> Yes, thanks Waiman, I should add the busy_out label.
>>>           } else {
>>>                   raw_spin_lock_irqsave(&b->raw_lock, flags);
>>>           }
>>>
>>>           *pflags = flags;
>>>           return 0;
>>> }
>> BTW, with that change, I believe you can actually remove all the percpu
>> map_locked count code.
> there are some case, for example, we run the bpf_prog A B in task
> context on the same cpu.
> bpf_prog A
> update map X
>      htab_lock_bucket
>          raw_spin_lock_irqsave()
>      lookup_elem_raw()
>          // bpf prog B is attached on lookup_elem_raw()
>          bpf prog B
>              update map X again and update the element
>                  htab_lock_bucket()
>                      // dead-lock
>                      raw_spinlock_irqsave()

I see, so nested locking is possible in this case. Besides using the 
percpu map_locked count, another way is to have a cpumask associated with each 
bucket lock and use each bit in the cpumask to control access using 
test_and_set_bit() for each cpu. That will allow more concurrency and 
you can actually find out how contended the lock is. Anyway, it is just 
a thought.
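
(A very rough sketch of that cpumask idea, with a made-up per-bucket field lock_cpus; the unlock path would clear the bit again. Illustration only, not a tested patch:)

static inline int htab_lock_bucket_cpumask(struct bucket *b,
					   unsigned long *pflags)
{
	unsigned long flags;
	int cpu;

	preempt_disable();
	cpu = smp_processor_id();
	/* same-CPU re-entry is detected per bucket instead of via the
	 * global percpu map_locked counters
	 */
	if (test_and_set_bit(cpu, cpumask_bits(&b->lock_cpus))) {
		preempt_enable();
		return -EBUSY;
	}

	raw_spin_lock_irqsave(&b->raw_lock, flags);
	*pflags = flags;
	return 0;
}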

Cheers,
Longman
Hou Tao Nov. 30, 2022, 4:13 a.m. UTC | #25
Hi,

On 11/30/2022 10:47 AM, Tonghao Zhang wrote:
> On Wed, Nov 30, 2022 at 9:50 AM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi Hao,
>>
>> On 11/30/2022 3:36 AM, Hao Luo wrote:
>>> On Tue, Nov 29, 2022 at 9:32 AM Boqun Feng <boqun.feng@gmail.com> wrote:
>>>> Just to be clear, I meant to refactor htab_lock_bucket() into a try
>>>> lock pattern. Also after a second thought, the below suggestion doesn't
>>>> work. I think the proper way is to make htab_lock_bucket() as a
>>>> raw_spin_trylock_irqsave().
>>>>
>>>> Regards,
>>>> Boqun
>>>>
>>> The potential deadlock happens when the lock is contended from the
>>> same cpu. When the lock is contended from a remote cpu, we would like
>>> the remote cpu to spin and wait, instead of giving up immediately. As
>>> this gives better throughput. So replacing the current
>>> raw_spin_lock_irqsave() with trylock sacrifices this performance gain.
>>>
>>> I suspect the source of the problem is the 'hash' that we used in
>>> htab_lock_bucket(). The 'hash' is derived from the 'key', I wonder
>>> whether we should use a hash derived from 'bucket' rather than from
>>> 'key'. For example, from the memory address of the 'bucket'. Because,
>>> different keys may fall into the same bucket, but yield different
>>> hashes. If the same bucket can never have two different 'hashes' here,
>>> the map_locked check should behave as intended. Also because
>>> ->map_locked is per-cpu, execution flows from two different cpus can
>>> both pass.
>> The warning from lockdep is due to the reason the bucket lock A is used in a
>> no-NMI context firstly, then the same bucke lock is used a NMI context, so
> Yes, I tested lockdep too, we can't use the lock in NMI(but only
> try_lock work fine) context if we use them no-NMI context. otherwise
> the lockdep prints the warning.
> * for the dead-lock case: we can use the
> 1. hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1)
> 2. or hash bucket address.
Using the computed hash will be better than the hash bucket address, because the hash
buckets are allocated sequentially.
>
> * for lockdep warning, we should use in_nmi check with map_locked.
>
> BTW, the patch doesn't work, so we can remove the lock_key
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c50eb518e262fa06bd334e6eec172eaf5d7a5bd9
>
> static inline int htab_lock_bucket(const struct bpf_htab *htab,
>                                    struct bucket *b, u32 hash,
>                                    unsigned long *pflags)
> {
>         unsigned long flags;
>
>         hash = hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1);
>
>         preempt_disable();
>         if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
>                 __this_cpu_dec(*(htab->map_locked[hash]));
>                 preempt_enable();
>                 return -EBUSY;
>         }
>
>         if (in_nmi()) {
>                 if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
>                         return -EBUSY;
The only purpose of trylock here is to make lockdep happy, and it may lead to
unnecessary -EBUSY errors for htab operations in NMI context. I still prefer adding
a virtual lock-class for map_locked to fix the lockdep warning. So could you use
separate patches to fix the potential deadlock and the lockdep warning? It
would be better if you could also add a bpf selftest for the deadlock problem, as said before.

Thanks,
Tao
>         } else {
>                 raw_spin_lock_irqsave(&b->raw_lock, flags);
>         }
>
>         *pflags = flags;
>         return 0;
> }
>
>
>> lockdep deduces that may be a dead-lock. I have already tried to use the same
>> map_locked for keys with the same bucket, the dead-lock is gone, but still got
>> lockdep warning.
>>> Hao
>>> .
>
Hao Luo Nov. 30, 2022, 5:02 a.m. UTC | #26
On Tue, Nov 29, 2022 at 8:13 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> On 11/30/2022 10:47 AM, Tonghao Zhang wrote:
<...>
> >         if (in_nmi()) {
> >                 if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
> >                         return -EBUSY;
>
> The only purpose of trylock here is to make lockdep happy and it may lead to
> unnecessary -EBUSY error for htab operations in NMI context. I still prefer add
> a virtual lock-class for map_locked to fix the lockdep warning. So could you use
> separated patches to fix the potential dead-lock and the lockdep warning ? It
> will be better you can also add a bpf selftests for deadlock problem as said before.
>

Agree with Tao here. Tonghao, could you send another version which:

- separates the fix to deadlock and the fix to lockdep warning
- includes a bpf selftest to verify the fix to deadlock
- with bpf-specific tag: [PATCH bpf-next]

There are multiple ideas on the fly in this thread, it's easy to lose
track of what has been proposed and what change you intend to make.

Thanks,
Hao
Tonghao Zhang Nov. 30, 2022, 5:55 a.m. UTC | #27
On Wed, Nov 30, 2022 at 12:13 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 11/30/2022 10:47 AM, Tonghao Zhang wrote:
> > On Wed, Nov 30, 2022 at 9:50 AM Hou Tao <houtao@huaweicloud.com> wrote:
> >> Hi Hao,
> >>
> >> On 11/30/2022 3:36 AM, Hao Luo wrote:
> >>> On Tue, Nov 29, 2022 at 9:32 AM Boqun Feng <boqun.feng@gmail.com> wrote:
> >>>> Just to be clear, I meant to refactor htab_lock_bucket() into a try
> >>>> lock pattern. Also after a second thought, the below suggestion doesn't
> >>>> work. I think the proper way is to make htab_lock_bucket() as a
> >>>> raw_spin_trylock_irqsave().
> >>>>
> >>>> Regards,
> >>>> Boqun
> >>>>
> >>> The potential deadlock happens when the lock is contended from the
> >>> same cpu. When the lock is contended from a remote cpu, we would like
> >>> the remote cpu to spin and wait, instead of giving up immediately. As
> >>> this gives better throughput. So replacing the current
> >>> raw_spin_lock_irqsave() with trylock sacrifices this performance gain.
> >>>
> >>> I suspect the source of the problem is the 'hash' that we used in
> >>> htab_lock_bucket(). The 'hash' is derived from the 'key', I wonder
> >>> whether we should use a hash derived from 'bucket' rather than from
> >>> 'key'. For example, from the memory address of the 'bucket'. Because,
> >>> different keys may fall into the same bucket, but yield different
> >>> hashes. If the same bucket can never have two different 'hashes' here,
> >>> the map_locked check should behave as intended. Also because
> >>> ->map_locked is per-cpu, execution flows from two different cpus can
> >>> both pass.
> >> The warning from lockdep is due to the reason the bucket lock A is used in a
> >> no-NMI context firstly, then the same bucke lock is used a NMI context, so
> > Yes, I tested lockdep too, we can't use the lock in NMI(but only
> > try_lock work fine) context if we use them no-NMI context. otherwise
> > the lockdep prints the warning.
> > * for the dead-lock case: we can use the
> > 1. hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1)
> > 2. or hash bucket address.
> Use the computed hash will be better than hash bucket address, because the hash
> buckets are allocated sequentially.
> >
> > * for lockdep warning, we should use in_nmi check with map_locked.
> >
> > BTW, the patch doesn't work, so we can remove the lock_key
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c50eb518e262fa06bd334e6eec172eaf5d7a5bd9
> >
> > static inline int htab_lock_bucket(const struct bpf_htab *htab,
> >                                    struct bucket *b, u32 hash,
> >                                    unsigned long *pflags)
> > {
> >         unsigned long flags;
> >
> >         hash = hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1);
> >
> >         preempt_disable();
> >         if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
> >                 __this_cpu_dec(*(htab->map_locked[hash]));
> >                 preempt_enable();
> >                 return -EBUSY;
> >         }
> >
> >         if (in_nmi()) {
> >                 if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
> >                         return -EBUSY;
> The only purpose of trylock here is to make lockdep happy and it may lead to
> unnecessary -EBUSY error for htab operations in NMI context. I still prefer add
> a virtual lock-class for map_locked to fix the lockdep warning. So could you use
Hi, what is a virtual lock-class? Can you give me an example of what you mean?
> separated patches to fix the potential dead-lock and the lockdep warning ? It
> will be better you can also add a bpf selftests for deadlock problem as said before.
>
> Thanks,
> Tao
> >         } else {
> >                 raw_spin_lock_irqsave(&b->raw_lock, flags);
> >         }
> >
> >         *pflags = flags;
> >         return 0;
> > }
> >
> >
> >> lockdep deduces that may be a dead-lock. I have already tried to use the same
> >> map_locked for keys with the same bucket, the dead-lock is gone, but still got
> >> lockdep warning.
> >>> Hao
> >>> .
> >
>
Tonghao Zhang Nov. 30, 2022, 5:56 a.m. UTC | #28
On Wed, Nov 30, 2022 at 1:02 PM Hao Luo <haoluo@google.com> wrote:
>
> On Tue, Nov 29, 2022 at 8:13 PM Hou Tao <houtao@huaweicloud.com> wrote:
> >
> > On 11/30/2022 10:47 AM, Tonghao Zhang wrote:
> <...>
> > >         if (in_nmi()) {
> > >                 if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
> > >                         return -EBUSY;
> >
> > The only purpose of trylock here is to make lockdep happy and it may lead to
> > unnecessary -EBUSY error for htab operations in NMI context. I still prefer add
> > a virtual lock-class for map_locked to fix the lockdep warning. So could you use
> > separated patches to fix the potential dead-lock and the lockdep warning ? It
> > will be better you can also add a bpf selftests for deadlock problem as said before.
> >
>
> Agree with Tao here. Tonghao, could you send another version which:
>
> - separates the fix to deadlock and the fix to lockdep warning
> - includes a bpf selftest to verify the fix to deadlock
> - with bpf-specific tag: [PATCH bpf-next]
>
> There are multiple ideas on the fly in this thread, it's easy to lose
> track of what has been proposed and what change you intend to make.
Hi, I will send v2 soon. Thanks.
> Thanks,
> Hao
Hou Tao Dec. 1, 2022, 2:53 a.m. UTC | #29
Hi,

On 11/30/2022 1:55 PM, Tonghao Zhang wrote:
> On Wed, Nov 30, 2022 at 12:13 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi,
>>
>> On 11/30/2022 10:47 AM, Tonghao Zhang wrote:
>>> On Wed, Nov 30, 2022 at 9:50 AM Hou Tao <houtao@huaweicloud.com> wrote:
>>>> Hi Hao,
>>>>
>>>> On 11/30/2022 3:36 AM, Hao Luo wrote:
>>>>> On Tue, Nov 29, 2022 at 9:32 AM Boqun Feng <boqun.feng@gmail.com> wrote:
>>>>>> Just to be clear, I meant to refactor htab_lock_bucket() into a try
>>>>>> lock pattern. Also after a second thought, the below suggestion doesn't
>>>>>> work. I think the proper way is to make htab_lock_bucket() as a
>>>>>> raw_spin_trylock_irqsave().
>>>>>>
>>>>>> Regards,
>>>>>> Boqun
>>>>>>
>>>>> The potential deadlock happens when the lock is contended from the
>>>>> same cpu. When the lock is contended from a remote cpu, we would like
>>>>> the remote cpu to spin and wait, instead of giving up immediately. As
>>>>> this gives better throughput. So replacing the current
>>>>> raw_spin_lock_irqsave() with trylock sacrifices this performance gain.
>>>>>
>>>>> I suspect the source of the problem is the 'hash' that we used in
>>>>> htab_lock_bucket(). The 'hash' is derived from the 'key', I wonder
>>>>> whether we should use a hash derived from 'bucket' rather than from
>>>>> 'key'. For example, from the memory address of the 'bucket'. Because,
>>>>> different keys may fall into the same bucket, but yield different
>>>>> hashes. If the same bucket can never have two different 'hashes' here,
>>>>> the map_locked check should behave as intended. Also because
>>>>> ->map_locked is per-cpu, execution flows from two different cpus can
>>>>> both pass.
>>>> The warning from lockdep is due to the reason the bucket lock A is used in a
>>>> no-NMI context firstly, then the same bucke lock is used a NMI context, so
>>> Yes, I tested lockdep too, we can't use the lock in NMI(but only
>>> try_lock work fine) context if we use them no-NMI context. otherwise
>>> the lockdep prints the warning.
>>> * for the dead-lock case: we can use the
>>> 1. hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1)
>>> 2. or hash bucket address.
>> Use the computed hash will be better than hash bucket address, because the hash
>> buckets are allocated sequentially.
>>> * for lockdep warning, we should use in_nmi check with map_locked.
>>>
>>> BTW, the patch doesn't work, so we can remove the lock_key
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c50eb518e262fa06bd334e6eec172eaf5d7a5bd9
>>>
>>> static inline int htab_lock_bucket(const struct bpf_htab *htab,
>>>                                    struct bucket *b, u32 hash,
>>>                                    unsigned long *pflags)
>>> {
>>>         unsigned long flags;
>>>
>>>         hash = hash & min(HASHTAB_MAP_LOCK_MASK, htab->n_buckets -1);
>>>
>>>         preempt_disable();
>>>         if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
>>>                 __this_cpu_dec(*(htab->map_locked[hash]));
>>>                 preempt_enable();
>>>                 return -EBUSY;
>>>         }
>>>
>>>         if (in_nmi()) {
>>>                 if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
>>>                         return -EBUSY;
>> The only purpose of trylock here is to make lockdep happy and it may lead to
>> unnecessary -EBUSY error for htab operations in NMI context. I still prefer add
>> a virtual lock-class for map_locked to fix the lockdep warning. So could you use
> Hi, what is virtual lock-class ? Can you give me an example of what you mean?
If LOCKDEP is enabled, raw_spinlock adds a dep_map to its definition and
also calls lock_acquire() and lock_release() to assist the deadlock check. Now
map_locked is not a lock, but it acts like a raw_spin_trylock, so we need to add
a dep_map to it manually and then also call lock_acquire(trylock=1) and
lock_release() when increasing and decreasing map_locked. You can reference
the implementations of raw_spin_trylock and raw_spin_unlock for more details.
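(A rough sketch of that manual annotation, with a made-up lockdep_map field map_locked_dep_map that would be set up with lockdep_init_map() at map creation time; illustrative only, not a tested patch:)

static inline bool htab_map_locked_acquire(struct bpf_htab *htab, u32 hash)
{
	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
		__this_cpu_dec(*(htab->map_locked[hash]));
		return false;
	}
	/* tell lockdep we "trylocked" the virtual map_locked lock:
	 * subclass 0, trylock = 1, read = 0, check = 1, no nest_lock
	 */
	lock_acquire(&htab->map_locked_dep_map, 0, 1, 0, 1, NULL, _RET_IP_);
	return true;
}

static inline void htab_map_locked_release(struct bpf_htab *htab, u32 hash)
{
	lock_release(&htab->map_locked_dep_map, _RET_IP_);
	__this_cpu_dec(*(htab->map_locked[hash]));
}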
>> separated patches to fix the potential dead-lock and the lockdep warning ? It
>> will be better you can also add a bpf selftests for deadlock problem as said before.
>>
>> Thanks,
>> Tao
>>>         } else {
>>>                 raw_spin_lock_irqsave(&b->raw_lock, flags);
>>>         }
>>>
>>>         *pflags = flags;
>>>         return 0;
>>> }
>>>
>>>
>>>> lockdep deduces that there may be a dead-lock. I have already tried to use the same
>>>> map_locked for keys within the same bucket; the dead-lock is gone, but I still got the
>>>> lockdep warning.
>>>>> Hao
>>>>> .
>

Patch

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22855d6ff6d3..429acd97c869 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -80,9 +80,6 @@  struct bucket {
 	raw_spinlock_t raw_lock;
 };
 
-#define HASHTAB_MAP_LOCK_COUNT 8
-#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
-
 struct bpf_htab {
 	struct bpf_map map;
 	struct bpf_mem_alloc ma;
@@ -104,7 +101,6 @@  struct bpf_htab {
 	u32 elem_size;	/* size of each element in bytes */
 	u32 hashrnd;
 	struct lock_class_key lockdep_key;
-	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
 };
 
 /* each htab element is struct htab_elem + key + value */
@@ -146,35 +142,26 @@  static void htab_init_buckets(struct bpf_htab *htab)
 	}
 }
 
-static inline int htab_lock_bucket(const struct bpf_htab *htab,
-				   struct bucket *b, u32 hash,
+static inline int htab_lock_bucket(struct bucket *b,
 				   unsigned long *pflags)
 {
 	unsigned long flags;
 
-	hash = hash & HASHTAB_MAP_LOCK_MASK;
-
-	preempt_disable();
-	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
-		__this_cpu_dec(*(htab->map_locked[hash]));
-		preempt_enable();
-		return -EBUSY;
+	if (in_nmi()) {
+		if (!raw_spin_trylock_irqsave(&b->raw_lock, flags))
+			return -EBUSY;
+	} else {
+		raw_spin_lock_irqsave(&b->raw_lock, flags);
 	}
 
-	raw_spin_lock_irqsave(&b->raw_lock, flags);
 	*pflags = flags;
-
 	return 0;
 }
 
-static inline void htab_unlock_bucket(const struct bpf_htab *htab,
-				      struct bucket *b, u32 hash,
+static inline void htab_unlock_bucket(struct bucket *b,
 				      unsigned long flags)
 {
-	hash = hash & HASHTAB_MAP_LOCK_MASK;
 	raw_spin_unlock_irqrestore(&b->raw_lock, flags);
-	__this_cpu_dec(*(htab->map_locked[hash]));
-	preempt_enable();
 }
 
 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -467,7 +454,7 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 	struct bpf_htab *htab;
-	int err, i;
+	int err;
 
 	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
 	if (!htab)
@@ -513,15 +500,6 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (!htab->buckets)
 		goto free_htab;
 
-	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
-		htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
-							   sizeof(int),
-							   sizeof(int),
-							   GFP_USER);
-		if (!htab->map_locked[i])
-			goto free_map_locked;
-	}
-
 	if (htab->map.map_flags & BPF_F_ZERO_SEED)
 		htab->hashrnd = 0;
 	else
@@ -549,13 +527,13 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (htab->use_percpu_counter) {
 		err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
 		if (err)
-			goto free_map_locked;
+			goto free_buckets;
 	}
 
 	if (prealloc) {
 		err = prealloc_init(htab);
 		if (err)
-			goto free_map_locked;
+			goto free_buckets;
 
 		if (!percpu && !lru) {
 			/* lru itself can remove the least used element, so
@@ -568,12 +546,12 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	} else {
 		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
 		if (err)
-			goto free_map_locked;
+			goto free_buckets;
 		if (percpu) {
 			err = bpf_mem_alloc_init(&htab->pcpu_ma,
 						 round_up(htab->map.value_size, 8), true);
 			if (err)
-				goto free_map_locked;
+				goto free_buckets;
 		}
 	}
 
@@ -581,11 +559,10 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 free_prealloc:
 	prealloc_destroy(htab);
-free_map_locked:
+free_buckets:
 	if (htab->use_percpu_counter)
 		percpu_counter_destroy(&htab->pcount);
-	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
-		free_percpu(htab->map_locked[i]);
+
 	bpf_map_area_free(htab->buckets);
 	bpf_mem_alloc_destroy(&htab->pcpu_ma);
 	bpf_mem_alloc_destroy(&htab->ma);
@@ -782,7 +759,7 @@  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 	b = __select_bucket(htab, tgt_l->hash);
 	head = &b->head;
 
-	ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
+	ret = htab_lock_bucket(b, &flags);
 	if (ret)
 		return false;
 
@@ -793,7 +770,7 @@  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 			break;
 		}
 
-	htab_unlock_bucket(htab, b, tgt_l->hash, flags);
+	htab_unlock_bucket(b, flags);
 
 	return l == tgt_l;
 }
@@ -1107,7 +1084,7 @@  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		 */
 	}
 
-	ret = htab_lock_bucket(htab, b, hash, &flags);
+	ret = htab_lock_bucket(b, &flags);
 	if (ret)
 		return ret;
 
@@ -1152,7 +1129,7 @@  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	}
 	ret = 0;
 err:
-	htab_unlock_bucket(htab, b, hash, flags);
+	htab_unlock_bucket(b, flags);
 	return ret;
 }
 
@@ -1198,7 +1175,7 @@  static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	copy_map_value(&htab->map,
 		       l_new->key + round_up(map->key_size, 8), value);
 
-	ret = htab_lock_bucket(htab, b, hash, &flags);
+	ret = htab_lock_bucket(b, &flags);
 	if (ret)
 		return ret;
 
@@ -1219,7 +1196,7 @@  static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	ret = 0;
 
 err:
-	htab_unlock_bucket(htab, b, hash, flags);
+	htab_unlock_bucket(b, flags);
 
 	if (ret)
 		htab_lru_push_free(htab, l_new);
@@ -1255,7 +1232,7 @@  static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	ret = htab_lock_bucket(htab, b, hash, &flags);
+	ret = htab_lock_bucket(b, &flags);
 	if (ret)
 		return ret;
 
@@ -1280,7 +1257,7 @@  static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	}
 	ret = 0;
 err:
-	htab_unlock_bucket(htab, b, hash, flags);
+	htab_unlock_bucket(b, flags);
 	return ret;
 }
 
@@ -1321,7 +1298,7 @@  static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 			return -ENOMEM;
 	}
 
-	ret = htab_lock_bucket(htab, b, hash, &flags);
+	ret = htab_lock_bucket(b, &flags);
 	if (ret)
 		return ret;
 
@@ -1345,7 +1322,7 @@  static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 	}
 	ret = 0;
 err:
-	htab_unlock_bucket(htab, b, hash, flags);
+	htab_unlock_bucket(b, flags);
 	if (l_new)
 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
 	return ret;
@@ -1384,7 +1361,7 @@  static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	ret = htab_lock_bucket(htab, b, hash, &flags);
+	ret = htab_lock_bucket(b, &flags);
 	if (ret)
 		return ret;
 
@@ -1397,7 +1374,7 @@  static int htab_map_delete_elem(struct bpf_map *map, void *key)
 		ret = -ENOENT;
 	}
 
-	htab_unlock_bucket(htab, b, hash, flags);
+	htab_unlock_bucket(b, flags);
 	return ret;
 }
 
@@ -1420,7 +1397,7 @@  static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	ret = htab_lock_bucket(htab, b, hash, &flags);
+	ret = htab_lock_bucket(b, &flags);
 	if (ret)
 		return ret;
 
@@ -1431,7 +1408,7 @@  static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 	else
 		ret = -ENOENT;
 
-	htab_unlock_bucket(htab, b, hash, flags);
+	htab_unlock_bucket(b, flags);
 	if (l)
 		htab_lru_push_free(htab, l);
 	return ret;
@@ -1494,7 +1471,6 @@  static void htab_map_free_timers(struct bpf_map *map)
 static void htab_map_free(struct bpf_map *map)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	int i;
 
 	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
 	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
@@ -1517,10 +1493,10 @@  static void htab_map_free(struct bpf_map *map)
 	bpf_map_area_free(htab->buckets);
 	bpf_mem_alloc_destroy(&htab->pcpu_ma);
 	bpf_mem_alloc_destroy(&htab->ma);
+
 	if (htab->use_percpu_counter)
 		percpu_counter_destroy(&htab->pcount);
-	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
-		free_percpu(htab->map_locked[i]);
+
 	lockdep_unregister_key(&htab->lockdep_key);
 	bpf_map_area_free(htab);
 }
@@ -1564,7 +1540,7 @@  static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	ret = htab_lock_bucket(htab, b, hash, &bflags);
+	ret = htab_lock_bucket(b, &bflags);
 	if (ret)
 		return ret;
 
@@ -1602,7 +1578,7 @@  static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 			free_htab_elem(htab, l);
 	}
 
-	htab_unlock_bucket(htab, b, hash, bflags);
+	htab_unlock_bucket(b, bflags);
 
 	if (is_lru_map && l)
 		htab_lru_push_free(htab, l);
@@ -1720,7 +1696,7 @@  __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 	head = &b->head;
 	/* do not grab the lock unless need it (bucket_cnt > 0). */
 	if (locked) {
-		ret = htab_lock_bucket(htab, b, batch, &flags);
+		ret = htab_lock_bucket(b, &flags);
 		if (ret) {
 			rcu_read_unlock();
 			bpf_enable_instrumentation();
@@ -1743,7 +1719,7 @@  __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 		/* Note that since bucket_cnt > 0 here, it is implicit
 		 * that the locked was grabbed, so release it.
 		 */
-		htab_unlock_bucket(htab, b, batch, flags);
+		htab_unlock_bucket(b, flags);
 		rcu_read_unlock();
 		bpf_enable_instrumentation();
 		goto after_loop;
@@ -1754,7 +1730,7 @@  __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 		/* Note that since bucket_cnt > 0 here, it is implicit
 		 * that the locked was grabbed, so release it.
 		 */
-		htab_unlock_bucket(htab, b, batch, flags);
+		htab_unlock_bucket(b, flags);
 		rcu_read_unlock();
 		bpf_enable_instrumentation();
 		kvfree(keys);
@@ -1815,7 +1791,7 @@  __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 		dst_val += value_size;
 	}
 
-	htab_unlock_bucket(htab, b, batch, flags);
+	htab_unlock_bucket(b, flags);
 	locked = false;
 
 	while (node_to_free) {