Message ID | 20210817172009.2770161-1-yhs@fb.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | selftests/bpf: fix flaky send_signal test | expand |
Context | Check | Description |
---|---|---|
netdev/cover_letter | success | Link |
netdev/fixes_present | success | Link |
netdev/patch_count | success | Link |
netdev/tree_selection | success | Clearly marked for bpf-next |
netdev/subject_prefix | success | Link |
netdev/cc_maintainers | warning | 7 maintainers not CCed: songliubraving@fb.com shuah@kernel.org kafai@fb.com netdev@vger.kernel.org john.fastabend@gmail.com linux-kselftest@vger.kernel.org kpsingh@kernel.org |
netdev/source_inline | success | Was 0 now: 0 |
netdev/verify_signedoff | success | Link |
netdev/module_param | success | Was 0 now: 0 |
netdev/build_32bit | success | Errors and warnings before: 0 this patch: 0 |
netdev/kdoc | success | Errors and warnings before: 0 this patch: 0 |
netdev/verify_fixes | success | Link |
netdev/checkpatch | warning | WARNING: line length of 81 exceeds 80 columns |
netdev/build_allmodconfig_warn | success | Errors and warnings before: 0 this patch: 0 |
netdev/header_inline | success | Link |
On Tue, Aug 17, 2021 at 10:20 AM Yonghong Song <yhs@fb.com> wrote: > > libbpf CI has reported send_signal test is flaky although > I am not able to reproduce it in my local environment. > But I am able to reproduce with on-demand libbpf CI ([1]). > > Through code analysis, the following is possible reason. > The failed subtest runs bpf program in softirq environment. > Since bpf_send_signal() only sends to a fork of "test_progs" > process. If the underlying current task is > not "test_progs", bpf_send_signal() will not be triggered > and the subtest will fail. > > To reduce the chances where the underlying process is not > the intended one, this patch boosted scheduling priority to > -20 (highest allowed by setpriority() call). And I did > 10 runs with on-demand libbpf CI with this patch and I > didn't observe any failures. > > [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml > > Signed-off-by: Yonghong Song <yhs@fb.com> > --- > .../selftests/bpf/prog_tests/send_signal.c | 33 +++++++++++++++---- > .../bpf/progs/test_send_signal_kern.c | 3 +- > 2 files changed, 28 insertions(+), 8 deletions(-) > > diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c > index 41e158ae888e..0701c97456da 100644 > --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c > +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c > @@ -1,5 +1,7 @@ > // SPDX-License-Identifier: GPL-2.0 > #include <test_progs.h> > +#include <sys/time.h> > +#include <sys/resource.h> > #include "test_send_signal_kern.skel.h" > > int sigusr1_received = 0; > @@ -10,7 +12,7 @@ static void sigusr1_handler(int signum) > } > > static void test_send_signal_common(struct perf_event_attr *attr, > - bool signal_thread) > + bool signal_thread, bool allow_skip) > { > struct test_send_signal_kern *skel; > int pipe_c2p[2], pipe_p2c[2]; > @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr, > } > > if (pid == 0) { > + int old_prio; > + > /* install signal handler and notify parent */ > signal(SIGUSR1, sigusr1_handler); > > close(pipe_c2p[0]); /* close read */ > close(pipe_p2c[1]); /* close write */ > > + /* boost with a high priority so we got a higher chance > + * that if an interrupt happens, the underlying task > + * is this process. > + */ > + errno = 0; > + old_prio = getpriority(PRIO_PROCESS, 0); > + ASSERT_OK(errno, "getpriority"); > + ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority"); > + > /* notify parent signal handler is installed */ > ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); > > @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr, > /* wait for parent notification and exit */ > ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); > > + /* restore the old priority */ > + ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority"); > + > close(pipe_c2p[1]); > close(pipe_p2c[0]); > exit(0); > @@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr, > goto disable_pmu; > } > > - ASSERT_EQ(buf[0], '2', "incorrect result"); > - > /* notify child safe to exit */ > ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write"); > > + if (skel->bss->status == 0 && allow_skip) { > + printf("%s:SKIP\n", __func__); > + test__skip(); > + } else if (skel->bss->status != 1) { > + ASSERT_EQ(buf[0], '2', "incorrect result"); > + } > + > disable_pmu: > close(pmu_fd); > destroy_skel: > @@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr, > > static void test_send_signal_tracepoint(bool signal_thread) > { > - test_send_signal_common(NULL, signal_thread); > + test_send_signal_common(NULL, signal_thread, false); > } > > static void test_send_signal_perf(bool signal_thread) > @@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread) > .config = PERF_COUNT_SW_CPU_CLOCK, > }; > > - test_send_signal_common(&attr, signal_thread); > + test_send_signal_common(&attr, signal_thread, true); > } > > static void test_send_signal_nmi(bool signal_thread) > @@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread) > close(pmu_fd); > } > > - test_send_signal_common(&attr, signal_thread); > + test_send_signal_common(&attr, signal_thread, true); > } > > void test_send_signal(void) > diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c > index b4233d3efac2..59c05c422bbd 100644 > --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c > +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c > @@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx) > ret = bpf_send_signal_thread(sig); > else > ret = bpf_send_signal(sig); > - if (ret == 0) > - status = 1; > + status = (ret == 0) ? 1 : 2; This doesn't make sense to me. status == 0 is the default value, it will stay 0 even if nothing is triggered, no BPF program is called, etc. If we are doing the skipping of the test logic (which I'd honestly just not do right now to see if we actually fixed the test), then I'd set status = 3 for the case when signal was triggered, but the current task is not test_progs. And only skip test if we get status 3. That is, status 0 and status 2 are bad (either not triggered, or some error when sending signal), 1 is OK, 3 is SKIP. But really, skipping a test that we couldn't randomly run doesn't feel good. Can you please leave the priority boosting part and drop the skipping part for now? > } > > return 0; > -- > 2.30.2 >
On 8/17/21 11:45 AM, Andrii Nakryiko wrote: > On Tue, Aug 17, 2021 at 10:20 AM Yonghong Song <yhs@fb.com> wrote: >> >> libbpf CI has reported send_signal test is flaky although >> I am not able to reproduce it in my local environment. >> But I am able to reproduce with on-demand libbpf CI ([1]). >> >> Through code analysis, the following is possible reason. >> The failed subtest runs bpf program in softirq environment. >> Since bpf_send_signal() only sends to a fork of "test_progs" >> process. If the underlying current task is >> not "test_progs", bpf_send_signal() will not be triggered >> and the subtest will fail. >> >> To reduce the chances where the underlying process is not >> the intended one, this patch boosted scheduling priority to >> -20 (highest allowed by setpriority() call). And I did >> 10 runs with on-demand libbpf CI with this patch and I >> didn't observe any failures. >> >> [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml >> >> Signed-off-by: Yonghong Song <yhs@fb.com> >> --- >> .../selftests/bpf/prog_tests/send_signal.c | 33 +++++++++++++++---- >> .../bpf/progs/test_send_signal_kern.c | 3 +- >> 2 files changed, 28 insertions(+), 8 deletions(-) >> >> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c >> index 41e158ae888e..0701c97456da 100644 >> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c >> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c >> @@ -1,5 +1,7 @@ >> // SPDX-License-Identifier: GPL-2.0 >> #include <test_progs.h> >> +#include <sys/time.h> >> +#include <sys/resource.h> >> #include "test_send_signal_kern.skel.h" >> >> int sigusr1_received = 0; >> @@ -10,7 +12,7 @@ static void sigusr1_handler(int signum) >> } >> >> static void test_send_signal_common(struct perf_event_attr *attr, >> - bool signal_thread) >> + bool signal_thread, bool allow_skip) >> { >> struct test_send_signal_kern *skel; >> int pipe_c2p[2], pipe_p2c[2]; >> @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr, >> } >> >> if (pid == 0) { >> + int old_prio; >> + >> /* install signal handler and notify parent */ >> signal(SIGUSR1, sigusr1_handler); >> >> close(pipe_c2p[0]); /* close read */ >> close(pipe_p2c[1]); /* close write */ >> >> + /* boost with a high priority so we got a higher chance >> + * that if an interrupt happens, the underlying task >> + * is this process. >> + */ >> + errno = 0; >> + old_prio = getpriority(PRIO_PROCESS, 0); >> + ASSERT_OK(errno, "getpriority"); >> + ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority"); >> + >> /* notify parent signal handler is installed */ >> ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); >> >> @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr, >> /* wait for parent notification and exit */ >> ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); >> >> + /* restore the old priority */ >> + ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority"); >> + >> close(pipe_c2p[1]); >> close(pipe_p2c[0]); >> exit(0); >> @@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr, >> goto disable_pmu; >> } >> >> - ASSERT_EQ(buf[0], '2', "incorrect result"); >> - >> /* notify child safe to exit */ >> ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write"); >> >> + if (skel->bss->status == 0 && allow_skip) { >> + printf("%s:SKIP\n", __func__); >> + test__skip(); >> + } else if (skel->bss->status != 1) { >> + ASSERT_EQ(buf[0], '2', "incorrect result"); >> + } >> + >> disable_pmu: >> close(pmu_fd); >> destroy_skel: >> @@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr, >> >> static void test_send_signal_tracepoint(bool signal_thread) >> { >> - test_send_signal_common(NULL, signal_thread); >> + test_send_signal_common(NULL, signal_thread, false); >> } >> >> static void test_send_signal_perf(bool signal_thread) >> @@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread) >> .config = PERF_COUNT_SW_CPU_CLOCK, >> }; >> >> - test_send_signal_common(&attr, signal_thread); >> + test_send_signal_common(&attr, signal_thread, true); >> } >> >> static void test_send_signal_nmi(bool signal_thread) >> @@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread) >> close(pmu_fd); >> } >> >> - test_send_signal_common(&attr, signal_thread); >> + test_send_signal_common(&attr, signal_thread, true); >> } >> >> void test_send_signal(void) >> diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c >> index b4233d3efac2..59c05c422bbd 100644 >> --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c >> +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c >> @@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx) >> ret = bpf_send_signal_thread(sig); >> else >> ret = bpf_send_signal(sig); >> - if (ret == 0) >> - status = 1; >> + status = (ret == 0) ? 1 : 2; > > This doesn't make sense to me. status == 0 is the default value, it > will stay 0 even if nothing is triggered, no BPF program is called, > etc. that is true. > > If we are doing the skipping of the test logic (which I'd honestly > just not do right now to see if we actually fixed the test), then I'd > set status = 3 for the case when signal was triggered, but the current > task is not test_progs. And only skip test if we get status 3. That > is, status 0 and status 2 are bad (either not triggered, or some error > when sending signal), 1 is OK, 3 is SKIP. Here, we *assume* bpf program always got called which should be the case unless softirq/nmi logic goes wrong. so status = 0 means pid doesn't match, and status = 1 means good bpf_send_signal happens, status = 2 means bpf_send_signal helper fails. > > But really, skipping a test that we couldn't randomly run doesn't feel > good. Can you please leave the priority boosting part and drop the > skipping part for now? Sure. Let me drop skipping part. With the patch, I am expecting in *most* cases, we should not observe flakiness. > >> } >> >> return 0; >> -- >> 2.30.2 >>
On Tue, Aug 17, 2021 at 12:01 PM Yonghong Song <yhs@fb.com> wrote: > > > > On 8/17/21 11:45 AM, Andrii Nakryiko wrote: > > On Tue, Aug 17, 2021 at 10:20 AM Yonghong Song <yhs@fb.com> wrote: > >> > >> libbpf CI has reported send_signal test is flaky although > >> I am not able to reproduce it in my local environment. > >> But I am able to reproduce with on-demand libbpf CI ([1]). > >> > >> Through code analysis, the following is possible reason. > >> The failed subtest runs bpf program in softirq environment. > >> Since bpf_send_signal() only sends to a fork of "test_progs" > >> process. If the underlying current task is > >> not "test_progs", bpf_send_signal() will not be triggered > >> and the subtest will fail. > >> > >> To reduce the chances where the underlying process is not > >> the intended one, this patch boosted scheduling priority to > >> -20 (highest allowed by setpriority() call). And I did > >> 10 runs with on-demand libbpf CI with this patch and I > >> didn't observe any failures. > >> > >> [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml > >> > >> Signed-off-by: Yonghong Song <yhs@fb.com> > >> --- > >> .../selftests/bpf/prog_tests/send_signal.c | 33 +++++++++++++++---- > >> .../bpf/progs/test_send_signal_kern.c | 3 +- > >> 2 files changed, 28 insertions(+), 8 deletions(-) > >> > >> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c > >> index 41e158ae888e..0701c97456da 100644 > >> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c > >> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c > >> @@ -1,5 +1,7 @@ > >> // SPDX-License-Identifier: GPL-2.0 > >> #include <test_progs.h> > >> +#include <sys/time.h> > >> +#include <sys/resource.h> > >> #include "test_send_signal_kern.skel.h" > >> > >> int sigusr1_received = 0; > >> @@ -10,7 +12,7 @@ static void sigusr1_handler(int signum) > >> } > >> > >> static void test_send_signal_common(struct perf_event_attr *attr, > >> - bool signal_thread) > >> + bool signal_thread, bool allow_skip) > >> { > >> struct test_send_signal_kern *skel; > >> int pipe_c2p[2], pipe_p2c[2]; > >> @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr, > >> } > >> > >> if (pid == 0) { > >> + int old_prio; > >> + > >> /* install signal handler and notify parent */ > >> signal(SIGUSR1, sigusr1_handler); > >> > >> close(pipe_c2p[0]); /* close read */ > >> close(pipe_p2c[1]); /* close write */ > >> > >> + /* boost with a high priority so we got a higher chance > >> + * that if an interrupt happens, the underlying task > >> + * is this process. > >> + */ > >> + errno = 0; > >> + old_prio = getpriority(PRIO_PROCESS, 0); > >> + ASSERT_OK(errno, "getpriority"); > >> + ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority"); > >> + > >> /* notify parent signal handler is installed */ > >> ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); > >> > >> @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr, > >> /* wait for parent notification and exit */ > >> ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); > >> > >> + /* restore the old priority */ > >> + ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority"); > >> + > >> close(pipe_c2p[1]); > >> close(pipe_p2c[0]); > >> exit(0); > >> @@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr, > >> goto disable_pmu; > >> } > >> > >> - ASSERT_EQ(buf[0], '2', "incorrect result"); > >> - > >> /* notify child safe to exit */ > >> ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write"); > >> > >> + if (skel->bss->status == 0 && allow_skip) { > >> + printf("%s:SKIP\n", __func__); > >> + test__skip(); > >> + } else if (skel->bss->status != 1) { > >> + ASSERT_EQ(buf[0], '2', "incorrect result"); > >> + } > >> + > >> disable_pmu: > >> close(pmu_fd); > >> destroy_skel: > >> @@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr, > >> > >> static void test_send_signal_tracepoint(bool signal_thread) > >> { > >> - test_send_signal_common(NULL, signal_thread); > >> + test_send_signal_common(NULL, signal_thread, false); > >> } > >> > >> static void test_send_signal_perf(bool signal_thread) > >> @@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread) > >> .config = PERF_COUNT_SW_CPU_CLOCK, > >> }; > >> > >> - test_send_signal_common(&attr, signal_thread); > >> + test_send_signal_common(&attr, signal_thread, true); > >> } > >> > >> static void test_send_signal_nmi(bool signal_thread) > >> @@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread) > >> close(pmu_fd); > >> } > >> > >> - test_send_signal_common(&attr, signal_thread); > >> + test_send_signal_common(&attr, signal_thread, true); > >> } > >> > >> void test_send_signal(void) > >> diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c > >> index b4233d3efac2..59c05c422bbd 100644 > >> --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c > >> +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c > >> @@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx) > >> ret = bpf_send_signal_thread(sig); > >> else > >> ret = bpf_send_signal(sig); > >> - if (ret == 0) > >> - status = 1; > >> + status = (ret == 0) ? 1 : 2; > > > > This doesn't make sense to me. status == 0 is the default value, it > > will stay 0 even if nothing is triggered, no BPF program is called, > > etc. > > that is true. > > > > > If we are doing the skipping of the test logic (which I'd honestly > > just not do right now to see if we actually fixed the test), then I'd > > set status = 3 for the case when signal was triggered, but the current > > task is not test_progs. And only skip test if we get status 3. That > > is, status 0 and status 2 are bad (either not triggered, or some error > > when sending signal), 1 is OK, 3 is SKIP. > > Here, we *assume* bpf program always got called which should be the case > unless softirq/nmi logic goes wrong. so status = 0 means > pid doesn't match, and status = 1 means good bpf_send_signal happens, > status = 2 means bpf_send_signal helper fails. Sorry, I didn't make my point clear. I meant that test shouldn't just assume that BPF program ran, so I'd add if ((bpf_get_current_pid_tgid() >> 32) == pid) { ... } else { status = 3; } Just to capture that we did get bpf_send_signal_test() called, but we didn't have correct current. But it doesn't matter for now, I'd like to see if prio games get us to stable tests with no skipping first. > > > > > But really, skipping a test that we couldn't randomly run doesn't feel > > good. Can you please leave the priority boosting part and drop the > > skipping part for now? > > Sure. Let me drop skipping part. With the patch, I am expecting in > *most* cases, we should not observe flakiness. Yep, thanks! > > > > >> } > >> > >> return 0; > >> -- > >> 2.30.2 > >>
diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index 41e158ae888e..0701c97456da 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <sys/time.h> +#include <sys/resource.h> #include "test_send_signal_kern.skel.h" int sigusr1_received = 0; @@ -10,7 +12,7 @@ static void sigusr1_handler(int signum) } static void test_send_signal_common(struct perf_event_attr *attr, - bool signal_thread) + bool signal_thread, bool allow_skip) { struct test_send_signal_kern *skel; int pipe_c2p[2], pipe_p2c[2]; @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr, } if (pid == 0) { + int old_prio; + /* install signal handler and notify parent */ signal(SIGUSR1, sigusr1_handler); close(pipe_c2p[0]); /* close read */ close(pipe_p2c[1]); /* close write */ + /* boost with a high priority so we got a higher chance + * that if an interrupt happens, the underlying task + * is this process. + */ + errno = 0; + old_prio = getpriority(PRIO_PROCESS, 0); + ASSERT_OK(errno, "getpriority"); + ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority"); + /* notify parent signal handler is installed */ ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr, /* wait for parent notification and exit */ ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); + /* restore the old priority */ + ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority"); + close(pipe_c2p[1]); close(pipe_p2c[0]); exit(0); @@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr, goto disable_pmu; } - ASSERT_EQ(buf[0], '2', "incorrect result"); - /* notify child safe to exit */ ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write"); + if (skel->bss->status == 0 && allow_skip) { + printf("%s:SKIP\n", __func__); + test__skip(); + } else if (skel->bss->status != 1) { + ASSERT_EQ(buf[0], '2', "incorrect result"); + } + disable_pmu: close(pmu_fd); destroy_skel: @@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr, static void test_send_signal_tracepoint(bool signal_thread) { - test_send_signal_common(NULL, signal_thread); + test_send_signal_common(NULL, signal_thread, false); } static void test_send_signal_perf(bool signal_thread) @@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread) .config = PERF_COUNT_SW_CPU_CLOCK, }; - test_send_signal_common(&attr, signal_thread); + test_send_signal_common(&attr, signal_thread, true); } static void test_send_signal_nmi(bool signal_thread) @@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread) close(pmu_fd); } - test_send_signal_common(&attr, signal_thread); + test_send_signal_common(&attr, signal_thread, true); } void test_send_signal(void) diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c index b4233d3efac2..59c05c422bbd 100644 --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c @@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx) ret = bpf_send_signal_thread(sig); else ret = bpf_send_signal(sig); - if (ret == 0) - status = 1; + status = (ret == 0) ? 1 : 2; } return 0;
libbpf CI has reported send_signal test is flaky although I am not able to reproduce it in my local environment. But I am able to reproduce with on-demand libbpf CI ([1]). Through code analysis, the following is possible reason. The failed subtest runs bpf program in softirq environment. Since bpf_send_signal() only sends to a fork of "test_progs" process. If the underlying current task is not "test_progs", bpf_send_signal() will not be triggered and the subtest will fail. To reduce the chances where the underlying process is not the intended one, this patch boosted scheduling priority to -20 (highest allowed by setpriority() call). And I did 10 runs with on-demand libbpf CI with this patch and I didn't observe any failures. [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml Signed-off-by: Yonghong Song <yhs@fb.com> --- .../selftests/bpf/prog_tests/send_signal.c | 33 +++++++++++++++---- .../bpf/progs/test_send_signal_kern.c | 3 +- 2 files changed, 28 insertions(+), 8 deletions(-)