Message ID | 20221219041551.69344-2-xiangxia.m.yue@gmail.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | BPF |
Headers | show |
Series | [bpf-next,v3,1/2] bpf: hash map, avoid deadlock with suitable hash mask | expand |
On 12/18/22 8:15 PM, xiangxia.m.yue@gmail.com wrote: > From: Tonghao Zhang <xiangxia.m.yue@gmail.com> > > This testing show how to reproduce deadlock in special case. > We update htab map in Task and NMI context. Task can be interrupted by > NMI, if the same map bucket was locked, there will be a deadlock. > > * map max_entries is 2. > * NMI using key 4 and Task context using key 20. > * so same bucket index but map_locked index is different. > > The selftest use perf to produce the NMI and fentry nmi_handle. > Note that bpf_overflow_handler checks bpf_prog_active, but in bpf update > map syscall increase this counter in bpf_disable_instrumentation. > Then fentry nmi_handle and update hash map will reproduce the issue. > > Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com> > Cc: Alexei Starovoitov <ast@kernel.org> > Cc: Daniel Borkmann <daniel@iogearbox.net> > Cc: Andrii Nakryiko <andrii@kernel.org> > Cc: Martin KaFai Lau <martin.lau@linux.dev> > Cc: Song Liu <song@kernel.org> > Cc: Yonghong Song <yhs@fb.com> > Cc: John Fastabend <john.fastabend@gmail.com> > Cc: KP Singh <kpsingh@kernel.org> > Cc: Stanislav Fomichev <sdf@google.com> > Cc: Hao Luo <haoluo@google.com> > Cc: Jiri Olsa <jolsa@kernel.org> > Cc: Hou Tao <houtao1@huawei.com> > Acked-by: Yonghong Song <yhs@fb.com> > --- > tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 + > tools/testing/selftests/bpf/DENYLIST.s390x | 1 + > .../selftests/bpf/prog_tests/htab_deadlock.c | 75 +++++++++++++++++++ > .../selftests/bpf/progs/htab_deadlock.c | 32 ++++++++ > 4 files changed, 109 insertions(+) > create mode 100644 tools/testing/selftests/bpf/prog_tests/htab_deadlock.c > create mode 100644 tools/testing/selftests/bpf/progs/htab_deadlock.c > > diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 > index 99cc33c51eaa..87e8fc9c9df2 100644 > --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 > +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 > @@ -24,6 +24,7 @@ 
fexit_test # fexit_attach unexpected error > get_func_args_test # get_func_args_test__attach unexpected error: -524 (errno 524) (trampoline) > get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (errno 524) (trampoline) > htab_update/reenter_update > +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) > kfree_skb # attach fentry unexpected error: -524 (trampoline) > kfunc_call/subprog # extern (var ksym) 'bpf_prog_active': not found in kernel BTF > kfunc_call/subprog_lskel # skel unexpected error: -2 > diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x > index 585fcf73c731..735239b31050 100644 > --- a/tools/testing/selftests/bpf/DENYLIST.s390x > +++ b/tools/testing/selftests/bpf/DENYLIST.s390x > @@ -26,6 +26,7 @@ get_func_args_test # trampoline > get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (trampoline) > get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) > htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) > +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) > kfree_skb # attach fentry unexpected error: -524 (trampoline) > kfunc_call # 'bpf_prog_active': not found in kernel BTF (?) > kfunc_dynptr_param # JIT does not support calling kernel function (kfunc) > diff --git a/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c > new file mode 100644 > index 000000000000..137dce8f1346 > --- /dev/null > +++ b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c > @@ -0,0 +1,75 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* Copyright (c) 2022 DiDi Global Inc. 
*/ > +#define _GNU_SOURCE > +#include <pthread.h> > +#include <sched.h> > +#include <test_progs.h> > + > +#include "htab_deadlock.skel.h" > + > +static int perf_event_open(void) > +{ > + struct perf_event_attr attr = {0}; > + int pfd; > + > + /* create perf event on CPU 0 */ > + attr.size = sizeof(attr); > + attr.type = PERF_TYPE_HARDWARE; > + attr.config = PERF_COUNT_HW_CPU_CYCLES; > + attr.freq = 1; > + attr.sample_freq = 1000; > + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); > + > + return pfd >= 0 ? pfd : -errno; > +} > + > +void test_htab_deadlock(void) > +{ > + unsigned int val = 0, key = 20; > + struct bpf_link *link = NULL; > + struct htab_deadlock *skel; > + int err, i, pfd; > + cpu_set_t cpus; > + > + skel = htab_deadlock__open_and_load(); > + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) > + return; > + > + err = htab_deadlock__attach(skel); > + if (!ASSERT_OK(err, "skel_attach")) > + goto clean_skel; > + > + /* NMI events. */ > + pfd = perf_event_open(); > + if (pfd < 0) { > + if (pfd == -ENOENT || pfd == -EOPNOTSUPP) { > + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); > + test__skip(); > + goto clean_skel; > + } > + if (!ASSERT_GE(pfd, 0, "perf_event_open")) > + goto clean_skel; > + } > + > + link = bpf_program__attach_perf_event(skel->progs.bpf_empty, pfd); > + if (!ASSERT_OK_PTR(link, "attach_perf_event")) > + goto clean_pfd; > + > + /* Pinned on CPU 0 */ > + CPU_ZERO(&cpus); > + CPU_SET(0, &cpus); > + pthread_setaffinity_np(pthread_self(), sizeof(cpus), &cpus); > + > + /* update bpf map concurrently on CPU0 in NMI and Task context. > + * there should be no kernel deadlock. 
> + */ > + for (i = 0; i < 100000; i++) > + bpf_map_update_elem(bpf_map__fd(skel->maps.htab), > + &key, &val, BPF_ANY); > + > + bpf_link__destroy(link); > +clean_pfd: > + close(pfd); > +clean_skel: > + htab_deadlock__destroy(skel); > +} > diff --git a/tools/testing/selftests/bpf/progs/htab_deadlock.c b/tools/testing/selftests/bpf/progs/htab_deadlock.c > new file mode 100644 > index 000000000000..d394f95e97c3 > --- /dev/null > +++ b/tools/testing/selftests/bpf/progs/htab_deadlock.c > @@ -0,0 +1,32 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* Copyright (c) 2022 DiDi Global Inc. */ > +#include <linux/bpf.h> > +#include <bpf/bpf_helpers.h> > +#include <bpf/bpf_tracing.h> > + > +char _license[] SEC("license") = "GPL"; > + > +struct { > + __uint(type, BPF_MAP_TYPE_HASH); > + __uint(max_entries, 2); > + __uint(map_flags, BPF_F_ZERO_SEED); > + __type(key, unsigned int); > + __type(value, unsigned int); > +} htab SEC(".maps"); > + > +/* nmi_handle on x86 platform. If changing keyword > + * "static" to "inline", this prog load failed. */ > +SEC("fentry/nmi_handle") The above comment is not what I mean. In arch/x86/kernel/nmi.c, we have static int nmi_handle(unsigned int type, struct pt_regs *regs) { ... } ... static noinstr void default_do_nmi(struct pt_regs *regs) { ... handled = nmi_handle(NMI_LOCAL, regs); ... } Since nmi_handle is a static function, it is possible that the function might be inlined in default_do_nmi by the compiler. If this happens, fentry/nmi_handle will not be triggered and the test will pass. So I suggest to change the comment to nmi_handle() is a static function and might be inlined into its caller. If this happens, the test can still pass without previous kernel fix. > +int bpf_nmi_handle(struct pt_regs *regs) > +{ > + unsigned int val = 0, key = 4; > + > + bpf_map_update_elem(&htab, &key, &val, BPF_ANY); > + return 0; > +} > + > +SEC("perf_event") > +int bpf_empty(struct pt_regs *regs) > +{ > + return 0; > +}
On Tue, Dec 27, 2022 at 8:43 PM Yonghong Song <yhs@meta.com> wrote: > > > > On 12/18/22 8:15 PM, xiangxia.m.yue@gmail.com wrote: > > From: Tonghao Zhang <xiangxia.m.yue@gmail.com> > > > > This testing show how to reproduce deadlock in special case. > > We update htab map in Task and NMI context. Task can be interrupted by > > NMI, if the same map bucket was locked, there will be a deadlock. > > > > * map max_entries is 2. > > * NMI using key 4 and Task context using key 20. > > * so same bucket index but map_locked index is different. > > > > The selftest use perf to produce the NMI and fentry nmi_handle. > > Note that bpf_overflow_handler checks bpf_prog_active, but in bpf update > > map syscall increase this counter in bpf_disable_instrumentation. > > Then fentry nmi_handle and update hash map will reproduce the issue. > > > > Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com> > > Cc: Alexei Starovoitov <ast@kernel.org> > > Cc: Daniel Borkmann <daniel@iogearbox.net> > > Cc: Andrii Nakryiko <andrii@kernel.org> > > Cc: Martin KaFai Lau <martin.lau@linux.dev> > > Cc: Song Liu <song@kernel.org> > > Cc: Yonghong Song <yhs@fb.com> > > Cc: John Fastabend <john.fastabend@gmail.com> > > Cc: KP Singh <kpsingh@kernel.org> > > Cc: Stanislav Fomichev <sdf@google.com> > > Cc: Hao Luo <haoluo@google.com> > > Cc: Jiri Olsa <jolsa@kernel.org> > > Cc: Hou Tao <houtao1@huawei.com> > > Acked-by: Yonghong Song <yhs@fb.com> > > --- > > tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 + > > tools/testing/selftests/bpf/DENYLIST.s390x | 1 + > > .../selftests/bpf/prog_tests/htab_deadlock.c | 75 +++++++++++++++++++ > > .../selftests/bpf/progs/htab_deadlock.c | 32 ++++++++ > > 4 files changed, 109 insertions(+) > > create mode 100644 tools/testing/selftests/bpf/prog_tests/htab_deadlock.c > > create mode 100644 tools/testing/selftests/bpf/progs/htab_deadlock.c > > > > diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 > > index 
99cc33c51eaa..87e8fc9c9df2 100644 > > --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 > > +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 > > @@ -24,6 +24,7 @@ fexit_test # fexit_attach unexpected error > > get_func_args_test # get_func_args_test__attach unexpected error: -524 (errno 524) (trampoline) > > get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (errno 524) (trampoline) > > htab_update/reenter_update > > +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) > > kfree_skb # attach fentry unexpected error: -524 (trampoline) > > kfunc_call/subprog # extern (var ksym) 'bpf_prog_active': not found in kernel BTF > > kfunc_call/subprog_lskel # skel unexpected error: -2 > > diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x > > index 585fcf73c731..735239b31050 100644 > > --- a/tools/testing/selftests/bpf/DENYLIST.s390x > > +++ b/tools/testing/selftests/bpf/DENYLIST.s390x > > @@ -26,6 +26,7 @@ get_func_args_test # trampoline > > get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (trampoline) > > get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) > > htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) > > +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) > > kfree_skb # attach fentry unexpected error: -524 (trampoline) > > kfunc_call # 'bpf_prog_active': not found in kernel BTF (?) > > kfunc_dynptr_param # JIT does not support calling kernel function (kfunc) > > diff --git a/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c > > new file mode 100644 > > index 000000000000..137dce8f1346 > > --- /dev/null > > +++ b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c > > @@ -0,0 +1,75 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +/* Copyright (c) 2022 DiDi Global Inc. 
*/ > > +#define _GNU_SOURCE > > +#include <pthread.h> > > +#include <sched.h> > > +#include <test_progs.h> > > + > > +#include "htab_deadlock.skel.h" > > + > > +static int perf_event_open(void) > > +{ > > + struct perf_event_attr attr = {0}; > > + int pfd; > > + > > + /* create perf event on CPU 0 */ > > + attr.size = sizeof(attr); > > + attr.type = PERF_TYPE_HARDWARE; > > + attr.config = PERF_COUNT_HW_CPU_CYCLES; > > + attr.freq = 1; > > + attr.sample_freq = 1000; > > + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); > > + > > + return pfd >= 0 ? pfd : -errno; > > +} > > + > > +void test_htab_deadlock(void) > > +{ > > + unsigned int val = 0, key = 20; > > + struct bpf_link *link = NULL; > > + struct htab_deadlock *skel; > > + int err, i, pfd; > > + cpu_set_t cpus; > > + > > + skel = htab_deadlock__open_and_load(); > > + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) > > + return; > > + > > + err = htab_deadlock__attach(skel); > > + if (!ASSERT_OK(err, "skel_attach")) > > + goto clean_skel; > > + > > + /* NMI events. */ > > + pfd = perf_event_open(); > > + if (pfd < 0) { > > + if (pfd == -ENOENT || pfd == -EOPNOTSUPP) { > > + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); > > + test__skip(); > > + goto clean_skel; > > + } > > + if (!ASSERT_GE(pfd, 0, "perf_event_open")) > > + goto clean_skel; > > + } > > + > > + link = bpf_program__attach_perf_event(skel->progs.bpf_empty, pfd); > > + if (!ASSERT_OK_PTR(link, "attach_perf_event")) > > + goto clean_pfd; > > + > > + /* Pinned on CPU 0 */ > > + CPU_ZERO(&cpus); > > + CPU_SET(0, &cpus); > > + pthread_setaffinity_np(pthread_self(), sizeof(cpus), &cpus); > > + > > + /* update bpf map concurrently on CPU0 in NMI and Task context. > > + * there should be no kernel deadlock. 
> > + */ > > + for (i = 0; i < 100000; i++) > > + bpf_map_update_elem(bpf_map__fd(skel->maps.htab), > > + &key, &val, BPF_ANY); > > + > > + bpf_link__destroy(link); > > +clean_pfd: > > + close(pfd); > > +clean_skel: > > + htab_deadlock__destroy(skel); > > +} > > diff --git a/tools/testing/selftests/bpf/progs/htab_deadlock.c b/tools/testing/selftests/bpf/progs/htab_deadlock.c > > new file mode 100644 > > index 000000000000..d394f95e97c3 > > --- /dev/null > > +++ b/tools/testing/selftests/bpf/progs/htab_deadlock.c > > @@ -0,0 +1,32 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +/* Copyright (c) 2022 DiDi Global Inc. */ > > +#include <linux/bpf.h> > > +#include <bpf/bpf_helpers.h> > > +#include <bpf/bpf_tracing.h> > > + > > +char _license[] SEC("license") = "GPL"; > > + > > +struct { > > + __uint(type, BPF_MAP_TYPE_HASH); > > + __uint(max_entries, 2); > > + __uint(map_flags, BPF_F_ZERO_SEED); > > + __type(key, unsigned int); > > + __type(value, unsigned int); > > +} htab SEC(".maps"); > > + > > +/* nmi_handle on x86 platform. If changing keyword > > + * "static" to "inline", this prog load failed. */ > > +SEC("fentry/nmi_handle") > > The above comment is not what I mean. In arch/x86/kernel/nmi.c, > we have > static int nmi_handle(unsigned int type, struct pt_regs *regs) > { > ... > } > ... > static noinstr void default_do_nmi(struct pt_regs *regs) > { > ... > handled = nmi_handle(NMI_LOCAL, regs); > ... > } > > Since nmi_handle is a static function, it is possible that > the function might be inlined in default_do_nmi by the > compiler. If this happens, fentry/nmi_handle will not > be triggered and the test will pass. > > So I suggest to change the comment to > nmi_handle() is a static function and might be > inlined into its caller. If this happens, the > test can still pass without previous kernel fix. It's worse than this. fentry is buggy. We shouldn't allow attaching fentry to: NOKPROBE_SYMBOL(nmi_handle);
On 12/28/22 2:24 PM, Alexei Starovoitov wrote: > On Tue, Dec 27, 2022 at 8:43 PM Yonghong Song <yhs@meta.com> wrote: >> >> >> >> On 12/18/22 8:15 PM, xiangxia.m.yue@gmail.com wrote: >>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com> >>> >>> This testing show how to reproduce deadlock in special case. >>> We update htab map in Task and NMI context. Task can be interrupted by >>> NMI, if the same map bucket was locked, there will be a deadlock. >>> >>> * map max_entries is 2. >>> * NMI using key 4 and Task context using key 20. >>> * so same bucket index but map_locked index is different. >>> >>> The selftest use perf to produce the NMI and fentry nmi_handle. >>> Note that bpf_overflow_handler checks bpf_prog_active, but in bpf update >>> map syscall increase this counter in bpf_disable_instrumentation. >>> Then fentry nmi_handle and update hash map will reproduce the issue. >>> >>> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com> >>> Cc: Alexei Starovoitov <ast@kernel.org> >>> Cc: Daniel Borkmann <daniel@iogearbox.net> >>> Cc: Andrii Nakryiko <andrii@kernel.org> >>> Cc: Martin KaFai Lau <martin.lau@linux.dev> >>> Cc: Song Liu <song@kernel.org> >>> Cc: Yonghong Song <yhs@fb.com> >>> Cc: John Fastabend <john.fastabend@gmail.com> >>> Cc: KP Singh <kpsingh@kernel.org> >>> Cc: Stanislav Fomichev <sdf@google.com> >>> Cc: Hao Luo <haoluo@google.com> >>> Cc: Jiri Olsa <jolsa@kernel.org> >>> Cc: Hou Tao <houtao1@huawei.com> >>> Acked-by: Yonghong Song <yhs@fb.com> >>> --- >>> tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 + >>> tools/testing/selftests/bpf/DENYLIST.s390x | 1 + >>> .../selftests/bpf/prog_tests/htab_deadlock.c | 75 +++++++++++++++++++ >>> .../selftests/bpf/progs/htab_deadlock.c | 32 ++++++++ >>> 4 files changed, 109 insertions(+) >>> create mode 100644 tools/testing/selftests/bpf/prog_tests/htab_deadlock.c >>> create mode 100644 tools/testing/selftests/bpf/progs/htab_deadlock.c >>> >>> diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 
b/tools/testing/selftests/bpf/DENYLIST.aarch64 >>> index 99cc33c51eaa..87e8fc9c9df2 100644 >>> --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 >>> +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 >>> @@ -24,6 +24,7 @@ fexit_test # fexit_attach unexpected error >>> get_func_args_test # get_func_args_test__attach unexpected error: -524 (errno 524) (trampoline) >>> get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (errno 524) (trampoline) >>> htab_update/reenter_update >>> +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) >>> kfree_skb # attach fentry unexpected error: -524 (trampoline) >>> kfunc_call/subprog # extern (var ksym) 'bpf_prog_active': not found in kernel BTF >>> kfunc_call/subprog_lskel # skel unexpected error: -2 >>> diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x >>> index 585fcf73c731..735239b31050 100644 >>> --- a/tools/testing/selftests/bpf/DENYLIST.s390x >>> +++ b/tools/testing/selftests/bpf/DENYLIST.s390x >>> @@ -26,6 +26,7 @@ get_func_args_test # trampoline >>> get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (trampoline) >>> get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) >>> htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) >>> +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) >>> kfree_skb # attach fentry unexpected error: -524 (trampoline) >>> kfunc_call # 'bpf_prog_active': not found in kernel BTF (?) 
>>> kfunc_dynptr_param # JIT does not support calling kernel function (kfunc) >>> diff --git a/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c >>> new file mode 100644 >>> index 000000000000..137dce8f1346 >>> --- /dev/null >>> +++ b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c >>> @@ -0,0 +1,75 @@ >>> +// SPDX-License-Identifier: GPL-2.0 >>> +/* Copyright (c) 2022 DiDi Global Inc. */ >>> +#define _GNU_SOURCE >>> +#include <pthread.h> >>> +#include <sched.h> >>> +#include <test_progs.h> >>> + >>> +#include "htab_deadlock.skel.h" >>> + >>> +static int perf_event_open(void) >>> +{ >>> + struct perf_event_attr attr = {0}; >>> + int pfd; >>> + >>> + /* create perf event on CPU 0 */ >>> + attr.size = sizeof(attr); >>> + attr.type = PERF_TYPE_HARDWARE; >>> + attr.config = PERF_COUNT_HW_CPU_CYCLES; >>> + attr.freq = 1; >>> + attr.sample_freq = 1000; >>> + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); >>> + >>> + return pfd >= 0 ? pfd : -errno; >>> +} >>> + >>> +void test_htab_deadlock(void) >>> +{ >>> + unsigned int val = 0, key = 20; >>> + struct bpf_link *link = NULL; >>> + struct htab_deadlock *skel; >>> + int err, i, pfd; >>> + cpu_set_t cpus; >>> + >>> + skel = htab_deadlock__open_and_load(); >>> + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) >>> + return; >>> + >>> + err = htab_deadlock__attach(skel); >>> + if (!ASSERT_OK(err, "skel_attach")) >>> + goto clean_skel; >>> + >>> + /* NMI events. 
*/ >>> + pfd = perf_event_open(); >>> + if (pfd < 0) { >>> + if (pfd == -ENOENT || pfd == -EOPNOTSUPP) { >>> + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); >>> + test__skip(); >>> + goto clean_skel; >>> + } >>> + if (!ASSERT_GE(pfd, 0, "perf_event_open")) >>> + goto clean_skel; >>> + } >>> + >>> + link = bpf_program__attach_perf_event(skel->progs.bpf_empty, pfd); >>> + if (!ASSERT_OK_PTR(link, "attach_perf_event")) >>> + goto clean_pfd; >>> + >>> + /* Pinned on CPU 0 */ >>> + CPU_ZERO(&cpus); >>> + CPU_SET(0, &cpus); >>> + pthread_setaffinity_np(pthread_self(), sizeof(cpus), &cpus); >>> + >>> + /* update bpf map concurrently on CPU0 in NMI and Task context. >>> + * there should be no kernel deadlock. >>> + */ >>> + for (i = 0; i < 100000; i++) >>> + bpf_map_update_elem(bpf_map__fd(skel->maps.htab), >>> + &key, &val, BPF_ANY); >>> + >>> + bpf_link__destroy(link); >>> +clean_pfd: >>> + close(pfd); >>> +clean_skel: >>> + htab_deadlock__destroy(skel); >>> +} >>> diff --git a/tools/testing/selftests/bpf/progs/htab_deadlock.c b/tools/testing/selftests/bpf/progs/htab_deadlock.c >>> new file mode 100644 >>> index 000000000000..d394f95e97c3 >>> --- /dev/null >>> +++ b/tools/testing/selftests/bpf/progs/htab_deadlock.c >>> @@ -0,0 +1,32 @@ >>> +// SPDX-License-Identifier: GPL-2.0 >>> +/* Copyright (c) 2022 DiDi Global Inc. */ >>> +#include <linux/bpf.h> >>> +#include <bpf/bpf_helpers.h> >>> +#include <bpf/bpf_tracing.h> >>> + >>> +char _license[] SEC("license") = "GPL"; >>> + >>> +struct { >>> + __uint(type, BPF_MAP_TYPE_HASH); >>> + __uint(max_entries, 2); >>> + __uint(map_flags, BPF_F_ZERO_SEED); >>> + __type(key, unsigned int); >>> + __type(value, unsigned int); >>> +} htab SEC(".maps"); >>> + >>> +/* nmi_handle on x86 platform. If changing keyword >>> + * "static" to "inline", this prog load failed. */ >>> +SEC("fentry/nmi_handle") >> >> The above comment is not what I mean. 
In arch/x86/kernel/nmi.c, >> we have >> static int nmi_handle(unsigned int type, struct pt_regs *regs) >> { >> ... >> } >> ... >> static noinstr void default_do_nmi(struct pt_regs *regs) >> { >> ... >> handled = nmi_handle(NMI_LOCAL, regs); >> ... >> } >> >> Since nmi_handle is a static function, it is possible that >> the function might be inlined in default_do_nmi by the >> compiler. If this happens, fentry/nmi_handle will not >> be triggered and the test will pass. >> >> So I suggest to change the comment to >> nmi_handle() is a static function and might be >> inlined into its caller. If this happens, the >> test can still pass without previous kernel fix. > > It's worse than this. > fentry is buggy. > We shouldn't allow attaching fentry to: > NOKPROBE_SYMBOL(nmi_handle); Okay, I see. Looks like we should prevent fentry from attaching any NOKPROBE_SYMBOL functions. BTW, I think fentry/nmi_handle can be replaced with tracepoint nmi/nmi_handler. It is more reliable and won't be impacted by potential NOKPROBE_SYMBOL issues.
a On Thu, Dec 29, 2022 at 2:29 PM Yonghong Song <yhs@meta.com> wrote: > > > > On 12/28/22 2:24 PM, Alexei Starovoitov wrote: > > On Tue, Dec 27, 2022 at 8:43 PM Yonghong Song <yhs@meta.com> wrote: > >> > >> > >> > >> On 12/18/22 8:15 PM, xiangxia.m.yue@gmail.com wrote: > >>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com> > >>> > >>> This testing show how to reproduce deadlock in special case. > >>> We update htab map in Task and NMI context. Task can be interrupted by > >>> NMI, if the same map bucket was locked, there will be a deadlock. > >>> > >>> * map max_entries is 2. > >>> * NMI using key 4 and Task context using key 20. > >>> * so same bucket index but map_locked index is different. > >>> > >>> The selftest use perf to produce the NMI and fentry nmi_handle. > >>> Note that bpf_overflow_handler checks bpf_prog_active, but in bpf update > >>> map syscall increase this counter in bpf_disable_instrumentation. > >>> Then fentry nmi_handle and update hash map will reproduce the issue. 
> >>> > >>> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com> > >>> Cc: Alexei Starovoitov <ast@kernel.org> > >>> Cc: Daniel Borkmann <daniel@iogearbox.net> > >>> Cc: Andrii Nakryiko <andrii@kernel.org> > >>> Cc: Martin KaFai Lau <martin.lau@linux.dev> > >>> Cc: Song Liu <song@kernel.org> > >>> Cc: Yonghong Song <yhs@fb.com> > >>> Cc: John Fastabend <john.fastabend@gmail.com> > >>> Cc: KP Singh <kpsingh@kernel.org> > >>> Cc: Stanislav Fomichev <sdf@google.com> > >>> Cc: Hao Luo <haoluo@google.com> > >>> Cc: Jiri Olsa <jolsa@kernel.org> > >>> Cc: Hou Tao <houtao1@huawei.com> > >>> Acked-by: Yonghong Song <yhs@fb.com> > >>> --- > >>> tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 + > >>> tools/testing/selftests/bpf/DENYLIST.s390x | 1 + > >>> .../selftests/bpf/prog_tests/htab_deadlock.c | 75 +++++++++++++++++++ > >>> .../selftests/bpf/progs/htab_deadlock.c | 32 ++++++++ > >>> 4 files changed, 109 insertions(+) > >>> create mode 100644 tools/testing/selftests/bpf/prog_tests/htab_deadlock.c > >>> create mode 100644 tools/testing/selftests/bpf/progs/htab_deadlock.c > >>> > >>> diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 > >>> index 99cc33c51eaa..87e8fc9c9df2 100644 > >>> --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 > >>> +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 > >>> @@ -24,6 +24,7 @@ fexit_test # fexit_attach unexpected error > >>> get_func_args_test # get_func_args_test__attach unexpected error: -524 (errno 524) (trampoline) > >>> get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (errno 524) (trampoline) > >>> htab_update/reenter_update > >>> +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) > >>> kfree_skb # attach fentry unexpected error: -524 (trampoline) > >>> kfunc_call/subprog # extern (var ksym) 'bpf_prog_active': not found in kernel BTF > >>> kfunc_call/subprog_lskel # skel unexpected error: -2 > >>> diff --git 
a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x > >>> index 585fcf73c731..735239b31050 100644 > >>> --- a/tools/testing/selftests/bpf/DENYLIST.s390x > >>> +++ b/tools/testing/selftests/bpf/DENYLIST.s390x > >>> @@ -26,6 +26,7 @@ get_func_args_test # trampoline > >>> get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (trampoline) > >>> get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) > >>> htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) > >>> +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) > >>> kfree_skb # attach fentry unexpected error: -524 (trampoline) > >>> kfunc_call # 'bpf_prog_active': not found in kernel BTF (?) > >>> kfunc_dynptr_param # JIT does not support calling kernel function (kfunc) > >>> diff --git a/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c > >>> new file mode 100644 > >>> index 000000000000..137dce8f1346 > >>> --- /dev/null > >>> +++ b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c > >>> @@ -0,0 +1,75 @@ > >>> +// SPDX-License-Identifier: GPL-2.0 > >>> +/* Copyright (c) 2022 DiDi Global Inc. */ > >>> +#define _GNU_SOURCE > >>> +#include <pthread.h> > >>> +#include <sched.h> > >>> +#include <test_progs.h> > >>> + > >>> +#include "htab_deadlock.skel.h" > >>> + > >>> +static int perf_event_open(void) > >>> +{ > >>> + struct perf_event_attr attr = {0}; > >>> + int pfd; > >>> + > >>> + /* create perf event on CPU 0 */ > >>> + attr.size = sizeof(attr); > >>> + attr.type = PERF_TYPE_HARDWARE; > >>> + attr.config = PERF_COUNT_HW_CPU_CYCLES; > >>> + attr.freq = 1; > >>> + attr.sample_freq = 1000; > >>> + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); > >>> + > >>> + return pfd >= 0 ? 
pfd : -errno; > >>> +} > >>> + > >>> +void test_htab_deadlock(void) > >>> +{ > >>> + unsigned int val = 0, key = 20; > >>> + struct bpf_link *link = NULL; > >>> + struct htab_deadlock *skel; > >>> + int err, i, pfd; > >>> + cpu_set_t cpus; > >>> + > >>> + skel = htab_deadlock__open_and_load(); > >>> + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) > >>> + return; > >>> + > >>> + err = htab_deadlock__attach(skel); > >>> + if (!ASSERT_OK(err, "skel_attach")) > >>> + goto clean_skel; > >>> + > >>> + /* NMI events. */ > >>> + pfd = perf_event_open(); > >>> + if (pfd < 0) { > >>> + if (pfd == -ENOENT || pfd == -EOPNOTSUPP) { > >>> + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); > >>> + test__skip(); > >>> + goto clean_skel; > >>> + } > >>> + if (!ASSERT_GE(pfd, 0, "perf_event_open")) > >>> + goto clean_skel; > >>> + } > >>> + > >>> + link = bpf_program__attach_perf_event(skel->progs.bpf_empty, pfd); > >>> + if (!ASSERT_OK_PTR(link, "attach_perf_event")) > >>> + goto clean_pfd; > >>> + > >>> + /* Pinned on CPU 0 */ > >>> + CPU_ZERO(&cpus); > >>> + CPU_SET(0, &cpus); > >>> + pthread_setaffinity_np(pthread_self(), sizeof(cpus), &cpus); > >>> + > >>> + /* update bpf map concurrently on CPU0 in NMI and Task context. > >>> + * there should be no kernel deadlock. > >>> + */ > >>> + for (i = 0; i < 100000; i++) > >>> + bpf_map_update_elem(bpf_map__fd(skel->maps.htab), > >>> + &key, &val, BPF_ANY); > >>> + > >>> + bpf_link__destroy(link); > >>> +clean_pfd: > >>> + close(pfd); > >>> +clean_skel: > >>> + htab_deadlock__destroy(skel); > >>> +} > >>> diff --git a/tools/testing/selftests/bpf/progs/htab_deadlock.c b/tools/testing/selftests/bpf/progs/htab_deadlock.c > >>> new file mode 100644 > >>> index 000000000000..d394f95e97c3 > >>> --- /dev/null > >>> +++ b/tools/testing/selftests/bpf/progs/htab_deadlock.c > >>> @@ -0,0 +1,32 @@ > >>> +// SPDX-License-Identifier: GPL-2.0 > >>> +/* Copyright (c) 2022 DiDi Global Inc. 
*/ > >>> +#include <linux/bpf.h> > >>> +#include <bpf/bpf_helpers.h> > >>> +#include <bpf/bpf_tracing.h> > >>> + > >>> +char _license[] SEC("license") = "GPL"; > >>> + > >>> +struct { > >>> + __uint(type, BPF_MAP_TYPE_HASH); > >>> + __uint(max_entries, 2); > >>> + __uint(map_flags, BPF_F_ZERO_SEED); > >>> + __type(key, unsigned int); > >>> + __type(value, unsigned int); > >>> +} htab SEC(".maps"); > >>> + > >>> +/* nmi_handle on x86 platform. If changing keyword > >>> + * "static" to "inline", this prog load failed. */ > >>> +SEC("fentry/nmi_handle") > >> > >> The above comment is not what I mean. In arch/x86/kernel/nmi.c, > >> we have > >> static int nmi_handle(unsigned int type, struct pt_regs *regs) > >> { > >> ... > >> } > >> ... > >> static noinstr void default_do_nmi(struct pt_regs *regs) > >> { > >> ... > >> handled = nmi_handle(NMI_LOCAL, regs); > >> ... > >> } > >> > >> Since nmi_handle is a static function, it is possible that > >> the function might be inlined in default_do_nmi by the > >> compiler. If this happens, fentry/nmi_handle will not > >> be triggered and the test will pass. > >> > >> So I suggest to change the comment to > >> nmi_handle() is a static function and might be > >> inlined into its caller. If this happens, the > >> test can still pass without previous kernel fix. > > > > It's worse than this. > > fentry is buggy. > > We shouldn't allow attaching fentry to: > > NOKPROBE_SYMBOL(nmi_handle); > > Okay, I see. Looks we should prevent fentry from > attaching any NOKPROBE_SYMBOL functions. > > BTW, I think fentry/nmi_handle can be replaced with > tracepoint nmi/nmi_handler. it is more reliable The tracepoint will not reproduce the deadlock(we have discussed v2). If it's not easy to complete a test for this case, should we drop this testcase patch? or fentry the nmi_handle and update the comments. > and won't be impacted by potential NOKPROBE_SYMBOL > issues.
On 1/2/23 6:40 PM, Tonghao Zhang wrote: > a > > On Thu, Dec 29, 2022 at 2:29 PM Yonghong Song <yhs@meta.com> wrote: >> >> >> >> On 12/28/22 2:24 PM, Alexei Starovoitov wrote: >>> On Tue, Dec 27, 2022 at 8:43 PM Yonghong Song <yhs@meta.com> wrote: >>>> >>>> >>>> >>>> On 12/18/22 8:15 PM, xiangxia.m.yue@gmail.com wrote: >>>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com> >>>>> >>>>> This testing show how to reproduce deadlock in special case. >>>>> We update htab map in Task and NMI context. Task can be interrupted by >>>>> NMI, if the same map bucket was locked, there will be a deadlock. >>>>> >>>>> * map max_entries is 2. >>>>> * NMI using key 4 and Task context using key 20. >>>>> * so same bucket index but map_locked index is different. >>>>> >>>>> The selftest use perf to produce the NMI and fentry nmi_handle. >>>>> Note that bpf_overflow_handler checks bpf_prog_active, but in bpf update >>>>> map syscall increase this counter in bpf_disable_instrumentation. >>>>> Then fentry nmi_handle and update hash map will reproduce the issue. 
>>>>> >>>>> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com> >>>>> Cc: Alexei Starovoitov <ast@kernel.org> >>>>> Cc: Daniel Borkmann <daniel@iogearbox.net> >>>>> Cc: Andrii Nakryiko <andrii@kernel.org> >>>>> Cc: Martin KaFai Lau <martin.lau@linux.dev> >>>>> Cc: Song Liu <song@kernel.org> >>>>> Cc: Yonghong Song <yhs@fb.com> >>>>> Cc: John Fastabend <john.fastabend@gmail.com> >>>>> Cc: KP Singh <kpsingh@kernel.org> >>>>> Cc: Stanislav Fomichev <sdf@google.com> >>>>> Cc: Hao Luo <haoluo@google.com> >>>>> Cc: Jiri Olsa <jolsa@kernel.org> >>>>> Cc: Hou Tao <houtao1@huawei.com> >>>>> Acked-by: Yonghong Song <yhs@fb.com> >>>>> --- >>>>> tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 + >>>>> tools/testing/selftests/bpf/DENYLIST.s390x | 1 + >>>>> .../selftests/bpf/prog_tests/htab_deadlock.c | 75 +++++++++++++++++++ >>>>> .../selftests/bpf/progs/htab_deadlock.c | 32 ++++++++ >>>>> 4 files changed, 109 insertions(+) >>>>> create mode 100644 tools/testing/selftests/bpf/prog_tests/htab_deadlock.c >>>>> create mode 100644 tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>> >>>>> diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 >>>>> index 99cc33c51eaa..87e8fc9c9df2 100644 >>>>> --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 >>>>> +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 >>>>> @@ -24,6 +24,7 @@ fexit_test # fexit_attach unexpected error >>>>> get_func_args_test # get_func_args_test__attach unexpected error: -524 (errno 524) (trampoline) >>>>> get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (errno 524) (trampoline) >>>>> htab_update/reenter_update >>>>> +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) >>>>> kfree_skb # attach fentry unexpected error: -524 (trampoline) >>>>> kfunc_call/subprog # extern (var ksym) 'bpf_prog_active': not found in kernel BTF >>>>> kfunc_call/subprog_lskel # skel unexpected error: -2 >>>>> diff --git 
a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x >>>>> index 585fcf73c731..735239b31050 100644 >>>>> --- a/tools/testing/selftests/bpf/DENYLIST.s390x >>>>> +++ b/tools/testing/selftests/bpf/DENYLIST.s390x >>>>> @@ -26,6 +26,7 @@ get_func_args_test # trampoline >>>>> get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (trampoline) >>>>> get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) >>>>> htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) >>>>> +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) >>>>> kfree_skb # attach fentry unexpected error: -524 (trampoline) >>>>> kfunc_call # 'bpf_prog_active': not found in kernel BTF (?) >>>>> kfunc_dynptr_param # JIT does not support calling kernel function (kfunc) >>>>> diff --git a/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c >>>>> new file mode 100644 >>>>> index 000000000000..137dce8f1346 >>>>> --- /dev/null >>>>> +++ b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c >>>>> @@ -0,0 +1,75 @@ >>>>> +// SPDX-License-Identifier: GPL-2.0 >>>>> +/* Copyright (c) 2022 DiDi Global Inc. */ >>>>> +#define _GNU_SOURCE >>>>> +#include <pthread.h> >>>>> +#include <sched.h> >>>>> +#include <test_progs.h> >>>>> + >>>>> +#include "htab_deadlock.skel.h" >>>>> + >>>>> +static int perf_event_open(void) >>>>> +{ >>>>> + struct perf_event_attr attr = {0}; >>>>> + int pfd; >>>>> + >>>>> + /* create perf event on CPU 0 */ >>>>> + attr.size = sizeof(attr); >>>>> + attr.type = PERF_TYPE_HARDWARE; >>>>> + attr.config = PERF_COUNT_HW_CPU_CYCLES; >>>>> + attr.freq = 1; >>>>> + attr.sample_freq = 1000; >>>>> + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); >>>>> + >>>>> + return pfd >= 0 ? 
pfd : -errno; >>>>> +} >>>>> + >>>>> +void test_htab_deadlock(void) >>>>> +{ >>>>> + unsigned int val = 0, key = 20; >>>>> + struct bpf_link *link = NULL; >>>>> + struct htab_deadlock *skel; >>>>> + int err, i, pfd; >>>>> + cpu_set_t cpus; >>>>> + >>>>> + skel = htab_deadlock__open_and_load(); >>>>> + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) >>>>> + return; >>>>> + >>>>> + err = htab_deadlock__attach(skel); >>>>> + if (!ASSERT_OK(err, "skel_attach")) >>>>> + goto clean_skel; >>>>> + >>>>> + /* NMI events. */ >>>>> + pfd = perf_event_open(); >>>>> + if (pfd < 0) { >>>>> + if (pfd == -ENOENT || pfd == -EOPNOTSUPP) { >>>>> + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); >>>>> + test__skip(); >>>>> + goto clean_skel; >>>>> + } >>>>> + if (!ASSERT_GE(pfd, 0, "perf_event_open")) >>>>> + goto clean_skel; >>>>> + } >>>>> + >>>>> + link = bpf_program__attach_perf_event(skel->progs.bpf_empty, pfd); >>>>> + if (!ASSERT_OK_PTR(link, "attach_perf_event")) >>>>> + goto clean_pfd; >>>>> + >>>>> + /* Pinned on CPU 0 */ >>>>> + CPU_ZERO(&cpus); >>>>> + CPU_SET(0, &cpus); >>>>> + pthread_setaffinity_np(pthread_self(), sizeof(cpus), &cpus); >>>>> + >>>>> + /* update bpf map concurrently on CPU0 in NMI and Task context. >>>>> + * there should be no kernel deadlock. >>>>> + */ >>>>> + for (i = 0; i < 100000; i++) >>>>> + bpf_map_update_elem(bpf_map__fd(skel->maps.htab), >>>>> + &key, &val, BPF_ANY); >>>>> + >>>>> + bpf_link__destroy(link); >>>>> +clean_pfd: >>>>> + close(pfd); >>>>> +clean_skel: >>>>> + htab_deadlock__destroy(skel); >>>>> +} >>>>> diff --git a/tools/testing/selftests/bpf/progs/htab_deadlock.c b/tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>> new file mode 100644 >>>>> index 000000000000..d394f95e97c3 >>>>> --- /dev/null >>>>> +++ b/tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>> @@ -0,0 +1,32 @@ >>>>> +// SPDX-License-Identifier: GPL-2.0 >>>>> +/* Copyright (c) 2022 DiDi Global Inc. 
*/ >>>>> +#include <linux/bpf.h> >>>>> +#include <bpf/bpf_helpers.h> >>>>> +#include <bpf/bpf_tracing.h> >>>>> + >>>>> +char _license[] SEC("license") = "GPL"; >>>>> + >>>>> +struct { >>>>> + __uint(type, BPF_MAP_TYPE_HASH); >>>>> + __uint(max_entries, 2); >>>>> + __uint(map_flags, BPF_F_ZERO_SEED); >>>>> + __type(key, unsigned int); >>>>> + __type(value, unsigned int); >>>>> +} htab SEC(".maps"); >>>>> + >>>>> +/* nmi_handle on x86 platform. If changing keyword >>>>> + * "static" to "inline", this prog load failed. */ >>>>> +SEC("fentry/nmi_handle") >>>> >>>> The above comment is not what I mean. In arch/x86/kernel/nmi.c, >>>> we have >>>> static int nmi_handle(unsigned int type, struct pt_regs *regs) >>>> { >>>> ... >>>> } >>>> ... >>>> static noinstr void default_do_nmi(struct pt_regs *regs) >>>> { >>>> ... >>>> handled = nmi_handle(NMI_LOCAL, regs); >>>> ... >>>> } >>>> >>>> Since nmi_handle is a static function, it is possible that >>>> the function might be inlined in default_do_nmi by the >>>> compiler. If this happens, fentry/nmi_handle will not >>>> be triggered and the test will pass. >>>> >>>> So I suggest to change the comment to >>>> nmi_handle() is a static function and might be >>>> inlined into its caller. If this happens, the >>>> test can still pass without previous kernel fix. >>> >>> It's worse than this. >>> fentry is buggy. >>> We shouldn't allow attaching fentry to: >>> NOKPROBE_SYMBOL(nmi_handle); >> >> Okay, I see. Looks we should prevent fentry from >> attaching any NOKPROBE_SYMBOL functions. >> >> BTW, I think fentry/nmi_handle can be replaced with >> tracepoint nmi/nmi_handler. it is more reliable > The tracepoint will not reproduce the deadlock(we have discussed v2). > If it's not easy to complete a test for this case, should we drop this > testcase patch? or fentry the nmi_handle and update the comments. 
could we use a softirq perf event (timer), e.g., struct perf_event_attr attr = { .sample_period = 1, .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_CLOCK, }; then you can attach function hrtimer_run_softirq (not tested) or similar functions? I suspect most (if not all) functions in nmi path cannot be kprobe'd. >> and won't be impacted by potential NOKPROBE_SYMBOL >> issues. > > >
Hi, On 1/4/2023 3:09 PM, Yonghong Song wrote: > > > On 1/2/23 6:40 PM, Tonghao Zhang wrote: >> a >> >> On Thu, Dec 29, 2022 at 2:29 PM Yonghong Song <yhs@meta.com> wrote: >>> >>> >>> >>> On 12/28/22 2:24 PM, Alexei Starovoitov wrote: >>>> On Tue, Dec 27, 2022 at 8:43 PM Yonghong Song <yhs@meta.com> wrote: >>>>> >>>>> >>>>> >>>>> On 12/18/22 8:15 PM, xiangxia.m.yue@gmail.com wrote: >>>>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com> >>>>>> >>>>>> This testing show how to reproduce deadlock in special case. >>>>>> We update htab map in Task and NMI context. Task can be interrupted by >>>>>> NMI, if the same map bucket was locked, there will be a deadlock. >>>>>> >>>>>> * map max_entries is 2. >>>>>> * NMI using key 4 and Task context using key 20. >>>>>> * so same bucket index but map_locked index is different. >>>>>> >>>>>> The selftest use perf to produce the NMI and fentry nmi_handle. >>>>>> Note that bpf_overflow_handler checks bpf_prog_active, but in bpf update >>>>>> map syscall increase this counter in bpf_disable_instrumentation. >>>>>> Then fentry nmi_handle and update hash map will reproduce the issue. SNIP >>>>>> diff --git a/tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>>> b/tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>>> new file mode 100644 >>>>>> index 000000000000..d394f95e97c3 >>>>>> --- /dev/null >>>>>> +++ b/tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>>> @@ -0,0 +1,32 @@ >>>>>> +// SPDX-License-Identifier: GPL-2.0 >>>>>> +/* Copyright (c) 2022 DiDi Global Inc. */ >>>>>> +#include <linux/bpf.h> >>>>>> +#include <bpf/bpf_helpers.h> >>>>>> +#include <bpf/bpf_tracing.h> >>>>>> + >>>>>> +char _license[] SEC("license") = "GPL"; >>>>>> + >>>>>> +struct { >>>>>> + __uint(type, BPF_MAP_TYPE_HASH); >>>>>> + __uint(max_entries, 2); >>>>>> + __uint(map_flags, BPF_F_ZERO_SEED); >>>>>> + __type(key, unsigned int); >>>>>> + __type(value, unsigned int); >>>>>> +} htab SEC(".maps"); >>>>>> + >>>>>> +/* nmi_handle on x86 platform. 
If changing keyword >>>>>> + * "static" to "inline", this prog load failed. */ >>>>>> +SEC("fentry/nmi_handle") >>>>> >>>>> The above comment is not what I mean. In arch/x86/kernel/nmi.c, >>>>> we have >>>>> static int nmi_handle(unsigned int type, struct pt_regs *regs) >>>>> { >>>>> ... >>>>> } >>>>> ... >>>>> static noinstr void default_do_nmi(struct pt_regs *regs) >>>>> { >>>>> ... >>>>> handled = nmi_handle(NMI_LOCAL, regs); >>>>> ... >>>>> } >>>>> >>>>> Since nmi_handle is a static function, it is possible that >>>>> the function might be inlined in default_do_nmi by the >>>>> compiler. If this happens, fentry/nmi_handle will not >>>>> be triggered and the test will pass. >>>>> >>>>> So I suggest to change the comment to >>>>> nmi_handle() is a static function and might be >>>>> inlined into its caller. If this happens, the >>>>> test can still pass without previous kernel fix. >>>> >>>> It's worse than this. >>>> fentry is buggy. >>>> We shouldn't allow attaching fentry to: >>>> NOKPROBE_SYMBOL(nmi_handle); >>> >>> Okay, I see. Looks we should prevent fentry from >>> attaching any NOKPROBE_SYMBOL functions. >>> >>> BTW, I think fentry/nmi_handle can be replaced with >>> tracepoint nmi/nmi_handler. it is more reliable >> The tracepoint will not reproduce the deadlock(we have discussed v2). >> If it's not easy to complete a test for this case, should we drop this >> testcase patch? or fentry the nmi_handle and update the comments. > > could we use a softirq perf event (timer), e.g., > > struct perf_event_attr attr = { > .sample_period = 1, > .type = PERF_TYPE_SOFTWARE, > .config = PERF_COUNT_SW_CPU_CLOCK, > }; > > then you can attach function hrtimer_run_softirq (not tested) or > similar functions? The context will be a hard-irq context, right ? Because htab_lock_bucket() has already disabled hard-irq on current CPU, so the dead-lock will be impossible. > > I suspect most (if not all) functions in nmi path cannot > be kprobe'd. 
It seems that perf_event_nmi_handler() is also nokprobe function. However I think we could try its callees (e.g., x86_pmu_handle_irq or perf_event_overflow). > >>> and won't be impacted by potential NOKPROBE_SYMBOL >>> issues. >> >> >> > .
On 1/3/23 11:51 PM, Hou Tao wrote: > Hi, > > On 1/4/2023 3:09 PM, Yonghong Song wrote: >> >> >> On 1/2/23 6:40 PM, Tonghao Zhang wrote: >>> a >>> >>> On Thu, Dec 29, 2022 at 2:29 PM Yonghong Song <yhs@meta.com> wrote: >>>> >>>> >>>> >>>> On 12/28/22 2:24 PM, Alexei Starovoitov wrote: >>>>> On Tue, Dec 27, 2022 at 8:43 PM Yonghong Song <yhs@meta.com> wrote: >>>>>> >>>>>> >>>>>> >>>>>> On 12/18/22 8:15 PM, xiangxia.m.yue@gmail.com wrote: >>>>>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com> >>>>>>> >>>>>>> This testing show how to reproduce deadlock in special case. >>>>>>> We update htab map in Task and NMI context. Task can be interrupted by >>>>>>> NMI, if the same map bucket was locked, there will be a deadlock. >>>>>>> >>>>>>> * map max_entries is 2. >>>>>>> * NMI using key 4 and Task context using key 20. >>>>>>> * so same bucket index but map_locked index is different. >>>>>>> >>>>>>> The selftest use perf to produce the NMI and fentry nmi_handle. >>>>>>> Note that bpf_overflow_handler checks bpf_prog_active, but in bpf update >>>>>>> map syscall increase this counter in bpf_disable_instrumentation. >>>>>>> Then fentry nmi_handle and update hash map will reproduce the issue. > SNIP >>>>>>> diff --git a/tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>>>> b/tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>>>> new file mode 100644 >>>>>>> index 000000000000..d394f95e97c3 >>>>>>> --- /dev/null >>>>>>> +++ b/tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>>>> @@ -0,0 +1,32 @@ >>>>>>> +// SPDX-License-Identifier: GPL-2.0 >>>>>>> +/* Copyright (c) 2022 DiDi Global Inc. 
*/ >>>>>>> +#include <linux/bpf.h> >>>>>>> +#include <bpf/bpf_helpers.h> >>>>>>> +#include <bpf/bpf_tracing.h> >>>>>>> + >>>>>>> +char _license[] SEC("license") = "GPL"; >>>>>>> + >>>>>>> +struct { >>>>>>> + __uint(type, BPF_MAP_TYPE_HASH); >>>>>>> + __uint(max_entries, 2); >>>>>>> + __uint(map_flags, BPF_F_ZERO_SEED); >>>>>>> + __type(key, unsigned int); >>>>>>> + __type(value, unsigned int); >>>>>>> +} htab SEC(".maps"); >>>>>>> + >>>>>>> +/* nmi_handle on x86 platform. If changing keyword >>>>>>> + * "static" to "inline", this prog load failed. */ >>>>>>> +SEC("fentry/nmi_handle") >>>>>> >>>>>> The above comment is not what I mean. In arch/x86/kernel/nmi.c, >>>>>> we have >>>>>> static int nmi_handle(unsigned int type, struct pt_regs *regs) >>>>>> { >>>>>> ... >>>>>> } >>>>>> ... >>>>>> static noinstr void default_do_nmi(struct pt_regs *regs) >>>>>> { >>>>>> ... >>>>>> handled = nmi_handle(NMI_LOCAL, regs); >>>>>> ... >>>>>> } >>>>>> >>>>>> Since nmi_handle is a static function, it is possible that >>>>>> the function might be inlined in default_do_nmi by the >>>>>> compiler. If this happens, fentry/nmi_handle will not >>>>>> be triggered and the test will pass. >>>>>> >>>>>> So I suggest to change the comment to >>>>>> nmi_handle() is a static function and might be >>>>>> inlined into its caller. If this happens, the >>>>>> test can still pass without previous kernel fix. >>>>> >>>>> It's worse than this. >>>>> fentry is buggy. >>>>> We shouldn't allow attaching fentry to: >>>>> NOKPROBE_SYMBOL(nmi_handle); >>>> >>>> Okay, I see. Looks we should prevent fentry from >>>> attaching any NOKPROBE_SYMBOL functions. >>>> >>>> BTW, I think fentry/nmi_handle can be replaced with >>>> tracepoint nmi/nmi_handler. it is more reliable >>> The tracepoint will not reproduce the deadlock(we have discussed v2). >>> If it's not easy to complete a test for this case, should we drop this >>> testcase patch? or fentry the nmi_handle and update the comments. 
>> >> could we use a softirq perf event (timer), e.g., >> >> struct perf_event_attr attr = { >> .sample_period = 1, >> .type = PERF_TYPE_SOFTWARE, >> .config = PERF_COUNT_SW_CPU_CLOCK, >> }; >> >> then you can attach function hrtimer_run_softirq (not tested) or >> similar functions? > The context will be a hard-irq context, right ? Because htab_lock_bucket() has > already disabled hard-irq on current CPU, so the dead-lock will be impossible. Okay, I see. soft-irq doesn't work. The only thing it works is nmi since it is non-masking. >> >> I suspect most (if not all) functions in nmi path cannot >> be kprobe'd. > It seems that perf_event_nmi_handler() is also nokprobe function. However I > think we could try its callees (e.g., x86_pmu_handle_irq or perf_event_overflow). If we can find a function in nmi handling path which is not marked as nonkprobe, sure, we can use that function for fentry. >> >>>> and won't be impacted by potential NOKPROBE_SYMBOL >>>> issues. >>> >>> >>> >> . >
On Wed, Jan 4, 2023 at 4:01 PM Yonghong Song <yhs@meta.com> wrote: > > > > On 1/3/23 11:51 PM, Hou Tao wrote: > > Hi, > > > > On 1/4/2023 3:09 PM, Yonghong Song wrote: > >> > >> > >> On 1/2/23 6:40 PM, Tonghao Zhang wrote: > >>> a > >>> > >>> On Thu, Dec 29, 2022 at 2:29 PM Yonghong Song <yhs@meta.com> wrote: > >>>> > >>>> > >>>> > >>>> On 12/28/22 2:24 PM, Alexei Starovoitov wrote: > >>>>> On Tue, Dec 27, 2022 at 8:43 PM Yonghong Song <yhs@meta.com> wrote: > >>>>>> > >>>>>> > >>>>>> > >>>>>> On 12/18/22 8:15 PM, xiangxia.m.yue@gmail.com wrote: > >>>>>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com> > >>>>>>> > >>>>>>> This testing show how to reproduce deadlock in special case. > >>>>>>> We update htab map in Task and NMI context. Task can be interrupted by > >>>>>>> NMI, if the same map bucket was locked, there will be a deadlock. > >>>>>>> > >>>>>>> * map max_entries is 2. > >>>>>>> * NMI using key 4 and Task context using key 20. > >>>>>>> * so same bucket index but map_locked index is different. > >>>>>>> > >>>>>>> The selftest use perf to produce the NMI and fentry nmi_handle. > >>>>>>> Note that bpf_overflow_handler checks bpf_prog_active, but in bpf update > >>>>>>> map syscall increase this counter in bpf_disable_instrumentation. > >>>>>>> Then fentry nmi_handle and update hash map will reproduce the issue. > > SNIP > >>>>>>> diff --git a/tools/testing/selftests/bpf/progs/htab_deadlock.c > >>>>>>> b/tools/testing/selftests/bpf/progs/htab_deadlock.c > >>>>>>> new file mode 100644 > >>>>>>> index 000000000000..d394f95e97c3 > >>>>>>> --- /dev/null > >>>>>>> +++ b/tools/testing/selftests/bpf/progs/htab_deadlock.c > >>>>>>> @@ -0,0 +1,32 @@ > >>>>>>> +// SPDX-License-Identifier: GPL-2.0 > >>>>>>> +/* Copyright (c) 2022 DiDi Global Inc. 
*/ > >>>>>>> +#include <linux/bpf.h> > >>>>>>> +#include <bpf/bpf_helpers.h> > >>>>>>> +#include <bpf/bpf_tracing.h> > >>>>>>> + > >>>>>>> +char _license[] SEC("license") = "GPL"; > >>>>>>> + > >>>>>>> +struct { > >>>>>>> + __uint(type, BPF_MAP_TYPE_HASH); > >>>>>>> + __uint(max_entries, 2); > >>>>>>> + __uint(map_flags, BPF_F_ZERO_SEED); > >>>>>>> + __type(key, unsigned int); > >>>>>>> + __type(value, unsigned int); > >>>>>>> +} htab SEC(".maps"); > >>>>>>> + > >>>>>>> +/* nmi_handle on x86 platform. If changing keyword > >>>>>>> + * "static" to "inline", this prog load failed. */ > >>>>>>> +SEC("fentry/nmi_handle") > >>>>>> > >>>>>> The above comment is not what I mean. In arch/x86/kernel/nmi.c, > >>>>>> we have > >>>>>> static int nmi_handle(unsigned int type, struct pt_regs *regs) > >>>>>> { > >>>>>> ... > >>>>>> } > >>>>>> ... > >>>>>> static noinstr void default_do_nmi(struct pt_regs *regs) > >>>>>> { > >>>>>> ... > >>>>>> handled = nmi_handle(NMI_LOCAL, regs); > >>>>>> ... > >>>>>> } > >>>>>> > >>>>>> Since nmi_handle is a static function, it is possible that > >>>>>> the function might be inlined in default_do_nmi by the > >>>>>> compiler. If this happens, fentry/nmi_handle will not > >>>>>> be triggered and the test will pass. > >>>>>> > >>>>>> So I suggest to change the comment to > >>>>>> nmi_handle() is a static function and might be > >>>>>> inlined into its caller. If this happens, the > >>>>>> test can still pass without previous kernel fix. > >>>>> > >>>>> It's worse than this. > >>>>> fentry is buggy. > >>>>> We shouldn't allow attaching fentry to: > >>>>> NOKPROBE_SYMBOL(nmi_handle); > >>>> > >>>> Okay, I see. Looks we should prevent fentry from > >>>> attaching any NOKPROBE_SYMBOL functions. > >>>> > >>>> BTW, I think fentry/nmi_handle can be replaced with > >>>> tracepoint nmi/nmi_handler. it is more reliable > >>> The tracepoint will not reproduce the deadlock(we have discussed v2). 
> >>> If it's not easy to complete a test for this case, should we drop this > >>> testcase patch? or fentry the nmi_handle and update the comments. > >> > >> could we use a softirq perf event (timer), e.g., > >> > >> struct perf_event_attr attr = { > >> .sample_period = 1, > >> .type = PERF_TYPE_SOFTWARE, > >> .config = PERF_COUNT_SW_CPU_CLOCK, > >> }; > >> > >> then you can attach function hrtimer_run_softirq (not tested) or > >> similar functions? > > The context will be a hard-irq context, right ? Because htab_lock_bucket() has > > already disabled hard-irq on current CPU, so the dead-lock will be impossible. > > Okay, I see. soft-irq doesn't work. The only thing it works is nmi since > it is non-masking. > > >> > >> I suspect most (if not all) functions in nmi path cannot > >> be kprobe'd. > > It seems that perf_event_nmi_handler() is also nokprobe function. However I > > think we could try its callees (e.g., x86_pmu_handle_irq or perf_event_overflow). > > If we can find a function in nmi handling path which is not marked as > nonkprobe, sure, we can use that function for fentry. I think perf_event_overflow is ok. [ 93.233093] dump_stack_lvl+0x57/0x81 [ 93.233098] lock_acquire+0x1f4/0x29a [ 93.233103] ? htab_lock_bucket+0x61/0x6c [ 93.233108] _raw_spin_lock_irqsave+0x43/0x7f [ 93.233111] ? htab_lock_bucket+0x61/0x6c [ 93.233114] htab_lock_bucket+0x61/0x6c [ 93.233118] htab_map_update_elem+0x11e/0x220 [ 93.233124] bpf_prog_df326439468c24a9_bpf_prog1+0x41/0x45 [ 93.233137] bpf_trampoline_6442478975_0+0x48/0x1000 [ 93.233144] perf_event_overflow+0x5/0x15 [ 93.233149] handle_pmi_common+0x1ad/0x1f0 [ 93.233166] intel_pmu_handle_irq+0x136/0x18a [ 93.233170] perf_event_nmi_handler+0x28/0x47 [ 93.233176] nmi_handle+0xb8/0x254 [ 93.233182] default_do_nmi+0x3d/0xf6 [ 93.233187] exc_nmi+0xa1/0x109 [ 93.233191] end_repeat_nmi+0x16/0x67 > >> > >>>> and won't be impacted by potential NOKPROBE_SYMBOL > >>>> issues. > >>> > >>> > >>> > >> . > >
On 1/4/23 6:32 AM, Tonghao Zhang wrote: > On Wed, Jan 4, 2023 at 4:01 PM Yonghong Song <yhs@meta.com> wrote: >> >> >> >> On 1/3/23 11:51 PM, Hou Tao wrote: >>> Hi, >>> >>> On 1/4/2023 3:09 PM, Yonghong Song wrote: >>>> >>>> >>>> On 1/2/23 6:40 PM, Tonghao Zhang wrote: >>>>> a >>>>> >>>>> On Thu, Dec 29, 2022 at 2:29 PM Yonghong Song <yhs@meta.com> wrote: >>>>>> >>>>>> >>>>>> >>>>>> On 12/28/22 2:24 PM, Alexei Starovoitov wrote: >>>>>>> On Tue, Dec 27, 2022 at 8:43 PM Yonghong Song <yhs@meta.com> wrote: >>>>>>>> >>>>>>>> >>>>>>>> >>>>>>>> On 12/18/22 8:15 PM, xiangxia.m.yue@gmail.com wrote: >>>>>>>>> From: Tonghao Zhang <xiangxia.m.yue@gmail.com> >>>>>>>>> >>>>>>>>> This testing show how to reproduce deadlock in special case. >>>>>>>>> We update htab map in Task and NMI context. Task can be interrupted by >>>>>>>>> NMI, if the same map bucket was locked, there will be a deadlock. >>>>>>>>> >>>>>>>>> * map max_entries is 2. >>>>>>>>> * NMI using key 4 and Task context using key 20. >>>>>>>>> * so same bucket index but map_locked index is different. >>>>>>>>> >>>>>>>>> The selftest use perf to produce the NMI and fentry nmi_handle. >>>>>>>>> Note that bpf_overflow_handler checks bpf_prog_active, but in bpf update >>>>>>>>> map syscall increase this counter in bpf_disable_instrumentation. >>>>>>>>> Then fentry nmi_handle and update hash map will reproduce the issue. >>> SNIP >>>>>>>>> diff --git a/tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>>>>>> b/tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>>>>>> new file mode 100644 >>>>>>>>> index 000000000000..d394f95e97c3 >>>>>>>>> --- /dev/null >>>>>>>>> +++ b/tools/testing/selftests/bpf/progs/htab_deadlock.c >>>>>>>>> @@ -0,0 +1,32 @@ >>>>>>>>> +// SPDX-License-Identifier: GPL-2.0 >>>>>>>>> +/* Copyright (c) 2022 DiDi Global Inc. 
*/ >>>>>>>>> +#include <linux/bpf.h> >>>>>>>>> +#include <bpf/bpf_helpers.h> >>>>>>>>> +#include <bpf/bpf_tracing.h> >>>>>>>>> + >>>>>>>>> +char _license[] SEC("license") = "GPL"; >>>>>>>>> + >>>>>>>>> +struct { >>>>>>>>> + __uint(type, BPF_MAP_TYPE_HASH); >>>>>>>>> + __uint(max_entries, 2); >>>>>>>>> + __uint(map_flags, BPF_F_ZERO_SEED); >>>>>>>>> + __type(key, unsigned int); >>>>>>>>> + __type(value, unsigned int); >>>>>>>>> +} htab SEC(".maps"); >>>>>>>>> + >>>>>>>>> +/* nmi_handle on x86 platform. If changing keyword >>>>>>>>> + * "static" to "inline", this prog load failed. */ >>>>>>>>> +SEC("fentry/nmi_handle") >>>>>>>> >>>>>>>> The above comment is not what I mean. In arch/x86/kernel/nmi.c, >>>>>>>> we have >>>>>>>> static int nmi_handle(unsigned int type, struct pt_regs *regs) >>>>>>>> { >>>>>>>> ... >>>>>>>> } >>>>>>>> ... >>>>>>>> static noinstr void default_do_nmi(struct pt_regs *regs) >>>>>>>> { >>>>>>>> ... >>>>>>>> handled = nmi_handle(NMI_LOCAL, regs); >>>>>>>> ... >>>>>>>> } >>>>>>>> >>>>>>>> Since nmi_handle is a static function, it is possible that >>>>>>>> the function might be inlined in default_do_nmi by the >>>>>>>> compiler. If this happens, fentry/nmi_handle will not >>>>>>>> be triggered and the test will pass. >>>>>>>> >>>>>>>> So I suggest to change the comment to >>>>>>>> nmi_handle() is a static function and might be >>>>>>>> inlined into its caller. If this happens, the >>>>>>>> test can still pass without previous kernel fix. >>>>>>> >>>>>>> It's worse than this. >>>>>>> fentry is buggy. >>>>>>> We shouldn't allow attaching fentry to: >>>>>>> NOKPROBE_SYMBOL(nmi_handle); >>>>>> >>>>>> Okay, I see. Looks we should prevent fentry from >>>>>> attaching any NOKPROBE_SYMBOL functions. >>>>>> >>>>>> BTW, I think fentry/nmi_handle can be replaced with >>>>>> tracepoint nmi/nmi_handler. it is more reliable >>>>> The tracepoint will not reproduce the deadlock(we have discussed v2). 
>>>>> If it's not easy to complete a test for this case, should we drop this >>>>> testcase patch? or fentry the nmi_handle and update the comments. >>>> >>>> could we use a softirq perf event (timer), e.g., >>>> >>>> struct perf_event_attr attr = { >>>> .sample_period = 1, >>>> .type = PERF_TYPE_SOFTWARE, >>>> .config = PERF_COUNT_SW_CPU_CLOCK, >>>> }; >>>> >>>> then you can attach function hrtimer_run_softirq (not tested) or >>>> similar functions? >>> The context will be a hard-irq context, right ? Because htab_lock_bucket() has >>> already disabled hard-irq on current CPU, so the dead-lock will be impossible. >> >> Okay, I see. soft-irq doesn't work. The only thing it works is nmi since >> it is non-masking. >> >>>> >>>> I suspect most (if not all) functions in nmi path cannot >>>> be kprobe'd. >>> It seems that perf_event_nmi_handler() is also nokprobe function. However I >>> think we could try its callees (e.g., x86_pmu_handle_irq or perf_event_overflow). >> >> If we can find a function in nmi handling path which is not marked as >> nonkprobe, sure, we can use that function for fentry. > I think perf_event_overflow is ok. > [ 93.233093] dump_stack_lvl+0x57/0x81 > [ 93.233098] lock_acquire+0x1f4/0x29a > [ 93.233103] ? htab_lock_bucket+0x61/0x6c > [ 93.233108] _raw_spin_lock_irqsave+0x43/0x7f > [ 93.233111] ? htab_lock_bucket+0x61/0x6c > [ 93.233114] htab_lock_bucket+0x61/0x6c > [ 93.233118] htab_map_update_elem+0x11e/0x220 > [ 93.233124] bpf_prog_df326439468c24a9_bpf_prog1+0x41/0x45 > [ 93.233137] bpf_trampoline_6442478975_0+0x48/0x1000 > [ 93.233144] perf_event_overflow+0x5/0x15 > [ 93.233149] handle_pmi_common+0x1ad/0x1f0 > [ 93.233166] intel_pmu_handle_irq+0x136/0x18a > [ 93.233170] perf_event_nmi_handler+0x28/0x47 > [ 93.233176] nmi_handle+0xb8/0x254 > [ 93.233182] default_do_nmi+0x3d/0xf6 > [ 93.233187] exc_nmi+0xa1/0x109 > [ 93.233191] end_repeat_nmi+0x16/0x67 sounds good. I agree fentry/perf_event_overflow can be used for the test. 
>>>> >>>>>> and won't be impacted by potential NOKPROBE_SYMBOL >>>>>> issues. >>>>> >>>>> >>>>> >>>> . >>> > > >
diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index 99cc33c51eaa..87e8fc9c9df2 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -24,6 +24,7 @@ fexit_test # fexit_attach unexpected error get_func_args_test # get_func_args_test__attach unexpected error: -524 (errno 524) (trampoline) get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (errno 524) (trampoline) htab_update/reenter_update +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) kfree_skb # attach fentry unexpected error: -524 (trampoline) kfunc_call/subprog # extern (var ksym) 'bpf_prog_active': not found in kernel BTF kfunc_call/subprog_lskel # skel unexpected error: -2 diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 585fcf73c731..735239b31050 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -26,6 +26,7 @@ get_func_args_test # trampoline get_func_ip_test # get_func_ip_test__attach unexpected error: -524 (trampoline) get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) +htab_deadlock # failed to find kernel BTF type ID of 'nmi_handle': -3 (trampoline) kfree_skb # attach fentry unexpected error: -524 (trampoline) kfunc_call # 'bpf_prog_active': not found in kernel BTF (?) kfunc_dynptr_param # JIT does not support calling kernel function (kfunc) diff --git a/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c new file mode 100644 index 000000000000..137dce8f1346 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/htab_deadlock.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 DiDi Global Inc. 
*/ +#define _GNU_SOURCE +#include <pthread.h> +#include <sched.h> +#include <test_progs.h> + +#include "htab_deadlock.skel.h" + +static int perf_event_open(void) +{ + struct perf_event_attr attr = {0}; + int pfd; + + /* create perf event on CPU 0 */ + attr.size = sizeof(attr); + attr.type = PERF_TYPE_HARDWARE; + attr.config = PERF_COUNT_HW_CPU_CYCLES; + attr.freq = 1; + attr.sample_freq = 1000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + + return pfd >= 0 ? pfd : -errno; +} + +void test_htab_deadlock(void) +{ + unsigned int val = 0, key = 20; + struct bpf_link *link = NULL; + struct htab_deadlock *skel; + int err, i, pfd; + cpu_set_t cpus; + + skel = htab_deadlock__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = htab_deadlock__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto clean_skel; + + /* NMI events. */ + pfd = perf_event_open(); + if (pfd < 0) { + if (pfd == -ENOENT || pfd == -EOPNOTSUPP) { + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); + test__skip(); + goto clean_skel; + } + if (!ASSERT_GE(pfd, 0, "perf_event_open")) + goto clean_skel; + } + + link = bpf_program__attach_perf_event(skel->progs.bpf_empty, pfd); + if (!ASSERT_OK_PTR(link, "attach_perf_event")) + goto clean_pfd; + + /* Pinned on CPU 0 */ + CPU_ZERO(&cpus); + CPU_SET(0, &cpus); + pthread_setaffinity_np(pthread_self(), sizeof(cpus), &cpus); + + /* update bpf map concurrently on CPU0 in NMI and Task context. + * there should be no kernel deadlock. 
+ */ + for (i = 0; i < 100000; i++) + bpf_map_update_elem(bpf_map__fd(skel->maps.htab), + &key, &val, BPF_ANY); + + bpf_link__destroy(link); +clean_pfd: + close(pfd); +clean_skel: + htab_deadlock__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/htab_deadlock.c b/tools/testing/selftests/bpf/progs/htab_deadlock.c new file mode 100644 index 000000000000..d394f95e97c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/htab_deadlock.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 DiDi Global Inc. */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 2); + __uint(map_flags, BPF_F_ZERO_SEED); + __type(key, unsigned int); + __type(value, unsigned int); +} htab SEC(".maps"); + +/* nmi_handle on x86 platform. If changing keyword + * "static" to "inline", this prog load failed. */ +SEC("fentry/nmi_handle") +int bpf_nmi_handle(struct pt_regs *regs) +{ + unsigned int val = 0, key = 4; + + bpf_map_update_elem(&htab, &key, &val, BPF_ANY); + return 0; +} + +SEC("perf_event") +int bpf_empty(struct pt_regs *regs) +{ + return 0; +}