diff mbox series

[bpf-next,2/2] selftests/bpf: fix flaky send_signal test

Message ID 20210817172009.2770161-1-yhs@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series selftests/bpf: fix flaky send_signal test | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 7 maintainers not CCed: songliubraving@fb.com shuah@kernel.org kafai@fb.com netdev@vger.kernel.org john.fastabend@gmail.com linux-kselftest@vger.kernel.org kpsingh@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/header_inline success Link

Commit Message

Yonghong Song Aug. 17, 2021, 5:20 p.m. UTC
libbpf CI has reported send_signal test is flaky although
I am not able to reproduce it in my local environment.
But I am able to reproduce with on-demand libbpf CI ([1]).

Through code analysis, the following is possible reason.
The failed subtest runs bpf program in softirq environment.
Since bpf_send_signal() only sends to a fork of "test_progs"
process. If the underlying current task is
not "test_progs", bpf_send_signal() will not be triggered
and the subtest will fail.

To reduce the chances where the underlying process is not
the intended one, this patch boosted scheduling priority to
-20 (highest allowed by setpriority() call). And I did
10 runs with on-demand libbpf CI with this patch and I
didn't observe any failures.

 [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/prog_tests/send_signal.c    | 33 +++++++++++++++----
 .../bpf/progs/test_send_signal_kern.c         |  3 +-
 2 files changed, 28 insertions(+), 8 deletions(-)

Comments

Andrii Nakryiko Aug. 17, 2021, 6:45 p.m. UTC | #1
On Tue, Aug 17, 2021 at 10:20 AM Yonghong Song <yhs@fb.com> wrote:
>
> libbpf CI has reported send_signal test is flaky although
> I am not able to reproduce it in my local environment.
> But I am able to reproduce with on-demand libbpf CI ([1]).
>
> Through code analysis, the following is possible reason.
> The failed subtest runs bpf program in softirq environment.
> Since bpf_send_signal() only sends to a fork of "test_progs"
> process. If the underlying current task is
> not "test_progs", bpf_send_signal() will not be triggered
> and the subtest will fail.
>
> To reduce the chances where the underlying process is not
> the intended one, this patch boosted scheduling priority to
> -20 (highest allowed by setpriority() call). And I did
> 10 runs with on-demand libbpf CI with this patch and I
> didn't observe any failures.
>
>  [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  .../selftests/bpf/prog_tests/send_signal.c    | 33 +++++++++++++++----
>  .../bpf/progs/test_send_signal_kern.c         |  3 +-
>  2 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> index 41e158ae888e..0701c97456da 100644
> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> @@ -1,5 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0
>  #include <test_progs.h>
> +#include <sys/time.h>
> +#include <sys/resource.h>
>  #include "test_send_signal_kern.skel.h"
>
>  int sigusr1_received = 0;
> @@ -10,7 +12,7 @@ static void sigusr1_handler(int signum)
>  }
>
>  static void test_send_signal_common(struct perf_event_attr *attr,
> -                                   bool signal_thread)
> +                                   bool signal_thread, bool allow_skip)
>  {
>         struct test_send_signal_kern *skel;
>         int pipe_c2p[2], pipe_p2c[2];
> @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>         }
>
>         if (pid == 0) {
> +               int old_prio;
> +
>                 /* install signal handler and notify parent */
>                 signal(SIGUSR1, sigusr1_handler);
>
>                 close(pipe_c2p[0]); /* close read */
>                 close(pipe_p2c[1]); /* close write */
>
> +               /* boost with a high priority so we got a higher chance
> +                * that if an interrupt happens, the underlying task
> +                * is this process.
> +                */
> +               errno = 0;
> +               old_prio = getpriority(PRIO_PROCESS, 0);
> +               ASSERT_OK(errno, "getpriority");
> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
> +
>                 /* notify parent signal handler is installed */
>                 ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
>
> @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>                 /* wait for parent notification and exit */
>                 ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
>
> +               /* restore the old priority */
> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
> +
>                 close(pipe_c2p[1]);
>                 close(pipe_p2c[0]);
>                 exit(0);
> @@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>                 goto disable_pmu;
>         }
>
> -       ASSERT_EQ(buf[0], '2', "incorrect result");
> -
>         /* notify child safe to exit */
>         ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
>
> +       if (skel->bss->status == 0 && allow_skip) {
> +               printf("%s:SKIP\n", __func__);
> +               test__skip();
> +       } else if (skel->bss->status != 1) {
> +               ASSERT_EQ(buf[0], '2', "incorrect result");
> +       }
> +
>  disable_pmu:
>         close(pmu_fd);
>  destroy_skel:
> @@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>
>  static void test_send_signal_tracepoint(bool signal_thread)
>  {
> -       test_send_signal_common(NULL, signal_thread);
> +       test_send_signal_common(NULL, signal_thread, false);
>  }
>
>  static void test_send_signal_perf(bool signal_thread)
> @@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread)
>                 .config = PERF_COUNT_SW_CPU_CLOCK,
>         };
>
> -       test_send_signal_common(&attr, signal_thread);
> +       test_send_signal_common(&attr, signal_thread, true);
>  }
>
>  static void test_send_signal_nmi(bool signal_thread)
> @@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread)
>                 close(pmu_fd);
>         }
>
> -       test_send_signal_common(&attr, signal_thread);
> +       test_send_signal_common(&attr, signal_thread, true);
>  }
>
>  void test_send_signal(void)
> diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> index b4233d3efac2..59c05c422bbd 100644
> --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> @@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx)
>                         ret = bpf_send_signal_thread(sig);
>                 else
>                         ret = bpf_send_signal(sig);
> -               if (ret == 0)
> -                       status = 1;
> +               status = (ret == 0) ? 1 : 2;

This doesn't make sense to me. status == 0 is the default value, it
will stay 0 even if nothing is triggered, no BPF program is called,
etc.

If we are doing the skipping of the test logic (which I'd honestly
just not do right now to see if we actually fixed the test), then I'd
set status = 3 for the case when signal was triggered, but the current
task is not test_progs. And only skip test if we get status 3. That
is, status 0 and status 2 are bad (either not triggered, or some error
when sending signal), 1 is OK, 3 is SKIP.

But really, skipping a test that we couldn't randomly run doesn't feel
good. Can you please leave the priority boosting part and drop the
skipping part for now?

>         }
>
>         return 0;
> --
> 2.30.2
>
Yonghong Song Aug. 17, 2021, 7:01 p.m. UTC | #2
On 8/17/21 11:45 AM, Andrii Nakryiko wrote:
> On Tue, Aug 17, 2021 at 10:20 AM Yonghong Song <yhs@fb.com> wrote:
>>
>> libbpf CI has reported send_signal test is flaky although
>> I am not able to reproduce it in my local environment.
>> But I am able to reproduce with on-demand libbpf CI ([1]).
>>
>> Through code analysis, the following is possible reason.
>> The failed subtest runs bpf program in softirq environment.
>> Since bpf_send_signal() only sends to a fork of "test_progs"
>> process. If the underlying current task is
>> not "test_progs", bpf_send_signal() will not be triggered
>> and the subtest will fail.
>>
>> To reduce the chances where the underlying process is not
>> the intended one, this patch boosted scheduling priority to
>> -20 (highest allowed by setpriority() call). And I did
>> 10 runs with on-demand libbpf CI with this patch and I
>> didn't observe any failures.
>>
>>   [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   .../selftests/bpf/prog_tests/send_signal.c    | 33 +++++++++++++++----
>>   .../bpf/progs/test_send_signal_kern.c         |  3 +-
>>   2 files changed, 28 insertions(+), 8 deletions(-)
>>
>> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
>> index 41e158ae888e..0701c97456da 100644
>> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
>> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
>> @@ -1,5 +1,7 @@
>>   // SPDX-License-Identifier: GPL-2.0
>>   #include <test_progs.h>
>> +#include <sys/time.h>
>> +#include <sys/resource.h>
>>   #include "test_send_signal_kern.skel.h"
>>
>>   int sigusr1_received = 0;
>> @@ -10,7 +12,7 @@ static void sigusr1_handler(int signum)
>>   }
>>
>>   static void test_send_signal_common(struct perf_event_attr *attr,
>> -                                   bool signal_thread)
>> +                                   bool signal_thread, bool allow_skip)
>>   {
>>          struct test_send_signal_kern *skel;
>>          int pipe_c2p[2], pipe_p2c[2];
>> @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>          }
>>
>>          if (pid == 0) {
>> +               int old_prio;
>> +
>>                  /* install signal handler and notify parent */
>>                  signal(SIGUSR1, sigusr1_handler);
>>
>>                  close(pipe_c2p[0]); /* close read */
>>                  close(pipe_p2c[1]); /* close write */
>>
>> +               /* boost with a high priority so we got a higher chance
>> +                * that if an interrupt happens, the underlying task
>> +                * is this process.
>> +                */
>> +               errno = 0;
>> +               old_prio = getpriority(PRIO_PROCESS, 0);
>> +               ASSERT_OK(errno, "getpriority");
>> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
>> +
>>                  /* notify parent signal handler is installed */
>>                  ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
>>
>> @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>                  /* wait for parent notification and exit */
>>                  ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
>>
>> +               /* restore the old priority */
>> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
>> +
>>                  close(pipe_c2p[1]);
>>                  close(pipe_p2c[0]);
>>                  exit(0);
>> @@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>                  goto disable_pmu;
>>          }
>>
>> -       ASSERT_EQ(buf[0], '2', "incorrect result");
>> -
>>          /* notify child safe to exit */
>>          ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
>>
>> +       if (skel->bss->status == 0 && allow_skip) {
>> +               printf("%s:SKIP\n", __func__);
>> +               test__skip();
>> +       } else if (skel->bss->status != 1) {
>> +               ASSERT_EQ(buf[0], '2', "incorrect result");
>> +       }
>> +
>>   disable_pmu:
>>          close(pmu_fd);
>>   destroy_skel:
>> @@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>
>>   static void test_send_signal_tracepoint(bool signal_thread)
>>   {
>> -       test_send_signal_common(NULL, signal_thread);
>> +       test_send_signal_common(NULL, signal_thread, false);
>>   }
>>
>>   static void test_send_signal_perf(bool signal_thread)
>> @@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread)
>>                  .config = PERF_COUNT_SW_CPU_CLOCK,
>>          };
>>
>> -       test_send_signal_common(&attr, signal_thread);
>> +       test_send_signal_common(&attr, signal_thread, true);
>>   }
>>
>>   static void test_send_signal_nmi(bool signal_thread)
>> @@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread)
>>                  close(pmu_fd);
>>          }
>>
>> -       test_send_signal_common(&attr, signal_thread);
>> +       test_send_signal_common(&attr, signal_thread, true);
>>   }
>>
>>   void test_send_signal(void)
>> diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
>> index b4233d3efac2..59c05c422bbd 100644
>> --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
>> +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
>> @@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx)
>>                          ret = bpf_send_signal_thread(sig);
>>                  else
>>                          ret = bpf_send_signal(sig);
>> -               if (ret == 0)
>> -                       status = 1;
>> +               status = (ret == 0) ? 1 : 2;
> 
> This doesn't make sense to me. status == 0 is the default value, it
> will stay 0 even if nothing is triggered, no BPF program is called,
> etc.

that is true.

> 
> If we are doing the skipping of the test logic (which I'd honestly
> just not do right now to see if we actually fixed the test), then I'd
> set status = 3 for the case when signal was triggered, but the current
> task is not test_progs. And only skip test if we get status 3. That
> is, status 0 and status 2 are bad (either not triggered, or some error
> when sending signal), 1 is OK, 3 is SKIP.

Here, we *assume* bpf program always got called which should be the case 
unless softirq/nmi logic goes wrong. so status = 0 means
pid doesn't match, and status = 1 means good bpf_send_signal happens,
status = 2 means bpf_send_signal helper fails.

> 
> But really, skipping a test that we couldn't randomly run doesn't feel
> good. Can you please leave the priority boosting part and drop the
> skipping part for now?

Sure. Let me drop skipping part. With the patch, I am expecting in 
*most* cases, we should not observe flakiness.

> 
>>          }
>>
>>          return 0;
>> --
>> 2.30.2
>>
Andrii Nakryiko Aug. 17, 2021, 7:06 p.m. UTC | #3
On Tue, Aug 17, 2021 at 12:01 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 8/17/21 11:45 AM, Andrii Nakryiko wrote:
> > On Tue, Aug 17, 2021 at 10:20 AM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> libbpf CI has reported send_signal test is flaky although
> >> I am not able to reproduce it in my local environment.
> >> But I am able to reproduce with on-demand libbpf CI ([1]).
> >>
> >> Through code analysis, the following is possible reason.
> >> The failed subtest runs bpf program in softirq environment.
> >> Since bpf_send_signal() only sends to a fork of "test_progs"
> >> process. If the underlying current task is
> >> not "test_progs", bpf_send_signal() will not be triggered
> >> and the subtest will fail.
> >>
> >> To reduce the chances where the underlying process is not
> >> the intended one, this patch boosted scheduling priority to
> >> -20 (highest allowed by setpriority() call). And I did
> >> 10 runs with on-demand libbpf CI with this patch and I
> >> didn't observe any failures.
> >>
> >>   [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >>   .../selftests/bpf/prog_tests/send_signal.c    | 33 +++++++++++++++----
> >>   .../bpf/progs/test_send_signal_kern.c         |  3 +-
> >>   2 files changed, 28 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> >> index 41e158ae888e..0701c97456da 100644
> >> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
> >> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> >> @@ -1,5 +1,7 @@
> >>   // SPDX-License-Identifier: GPL-2.0
> >>   #include <test_progs.h>
> >> +#include <sys/time.h>
> >> +#include <sys/resource.h>
> >>   #include "test_send_signal_kern.skel.h"
> >>
> >>   int sigusr1_received = 0;
> >> @@ -10,7 +12,7 @@ static void sigusr1_handler(int signum)
> >>   }
> >>
> >>   static void test_send_signal_common(struct perf_event_attr *attr,
> >> -                                   bool signal_thread)
> >> +                                   bool signal_thread, bool allow_skip)
> >>   {
> >>          struct test_send_signal_kern *skel;
> >>          int pipe_c2p[2], pipe_p2c[2];
> >> @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr,
> >>          }
> >>
> >>          if (pid == 0) {
> >> +               int old_prio;
> >> +
> >>                  /* install signal handler and notify parent */
> >>                  signal(SIGUSR1, sigusr1_handler);
> >>
> >>                  close(pipe_c2p[0]); /* close read */
> >>                  close(pipe_p2c[1]); /* close write */
> >>
> >> +               /* boost with a high priority so we got a higher chance
> >> +                * that if an interrupt happens, the underlying task
> >> +                * is this process.
> >> +                */
> >> +               errno = 0;
> >> +               old_prio = getpriority(PRIO_PROCESS, 0);
> >> +               ASSERT_OK(errno, "getpriority");
> >> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
> >> +
> >>                  /* notify parent signal handler is installed */
> >>                  ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
> >>
> >> @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr,
> >>                  /* wait for parent notification and exit */
> >>                  ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
> >>
> >> +               /* restore the old priority */
> >> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
> >> +
> >>                  close(pipe_c2p[1]);
> >>                  close(pipe_p2c[0]);
> >>                  exit(0);
> >> @@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr,
> >>                  goto disable_pmu;
> >>          }
> >>
> >> -       ASSERT_EQ(buf[0], '2', "incorrect result");
> >> -
> >>          /* notify child safe to exit */
> >>          ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
> >>
> >> +       if (skel->bss->status == 0 && allow_skip) {
> >> +               printf("%s:SKIP\n", __func__);
> >> +               test__skip();
> >> +       } else if (skel->bss->status != 1) {
> >> +               ASSERT_EQ(buf[0], '2', "incorrect result");
> >> +       }
> >> +
> >>   disable_pmu:
> >>          close(pmu_fd);
> >>   destroy_skel:
> >> @@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
> >>
> >>   static void test_send_signal_tracepoint(bool signal_thread)
> >>   {
> >> -       test_send_signal_common(NULL, signal_thread);
> >> +       test_send_signal_common(NULL, signal_thread, false);
> >>   }
> >>
> >>   static void test_send_signal_perf(bool signal_thread)
> >> @@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread)
> >>                  .config = PERF_COUNT_SW_CPU_CLOCK,
> >>          };
> >>
> >> -       test_send_signal_common(&attr, signal_thread);
> >> +       test_send_signal_common(&attr, signal_thread, true);
> >>   }
> >>
> >>   static void test_send_signal_nmi(bool signal_thread)
> >> @@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread)
> >>                  close(pmu_fd);
> >>          }
> >>
> >> -       test_send_signal_common(&attr, signal_thread);
> >> +       test_send_signal_common(&attr, signal_thread, true);
> >>   }
> >>
> >>   void test_send_signal(void)
> >> diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> >> index b4233d3efac2..59c05c422bbd 100644
> >> --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> >> +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> >> @@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx)
> >>                          ret = bpf_send_signal_thread(sig);
> >>                  else
> >>                          ret = bpf_send_signal(sig);
> >> -               if (ret == 0)
> >> -                       status = 1;
> >> +               status = (ret == 0) ? 1 : 2;
> >
> > This doesn't make sense to me. status == 0 is the default value, it
> > will stay 0 even if nothing is triggered, no BPF program is called,
> > etc.
>
> that is true.
>
> >
> > If we are doing the skipping of the test logic (which I'd honestly
> > just not do right now to see if we actually fixed the test), then I'd
> > set status = 3 for the case when signal was triggered, but the current
> > task is not test_progs. And only skip test if we get status 3. That
> > is, status 0 and status 2 are bad (either not triggered, or some error
> > when sending signal), 1 is OK, 3 is SKIP.
>
> Here, we *assume* bpf program always got called which should be the case
> unless softirq/nmi logic goes wrong. so status = 0 means
> pid doesn't match, and status = 1 means good bpf_send_signal happens,
> status = 2 means bpf_send_signal helper fails.

Sorry, I didn't make my point clear. I meant that test shouldn't just
assume that BPF program ran, so I'd add

if ((bpf_get_current_pid_tgid() >> 32) == pid) {
   ...
} else {
    status = 3;
}

Just to capture that we did get bpf_send_signal_test() called, but we
didn't have correct current.

But it doesn't matter for now, I'd like to see if prio games get us to
stable tests with no skipping first.


>
> >
> > But really, skipping a test that we couldn't randomly run doesn't feel
> > good. Can you please leave the priority boosting part and drop the
> > skipping part for now?
>
> Sure. Let me drop skipping part. With the patch, I am expecting in
> *most* cases, we should not observe flakiness.

Yep, thanks!

>
> >
> >>          }
> >>
> >>          return 0;
> >> --
> >> 2.30.2
> >>
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index 41e158ae888e..0701c97456da 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -1,5 +1,7 @@ 
 // SPDX-License-Identifier: GPL-2.0
 #include <test_progs.h>
+#include <sys/time.h>
+#include <sys/resource.h>
 #include "test_send_signal_kern.skel.h"
 
 int sigusr1_received = 0;
@@ -10,7 +12,7 @@  static void sigusr1_handler(int signum)
 }
 
 static void test_send_signal_common(struct perf_event_attr *attr,
-				    bool signal_thread)
+				    bool signal_thread, bool allow_skip)
 {
 	struct test_send_signal_kern *skel;
 	int pipe_c2p[2], pipe_p2c[2];
@@ -37,12 +39,23 @@  static void test_send_signal_common(struct perf_event_attr *attr,
 	}
 
 	if (pid == 0) {
+		int old_prio;
+
 		/* install signal handler and notify parent */
 		signal(SIGUSR1, sigusr1_handler);
 
 		close(pipe_c2p[0]); /* close read */
 		close(pipe_p2c[1]); /* close write */
 
+		/* boost with a high priority so we got a higher chance
+		 * that if an interrupt happens, the underlying task
+		 * is this process.
+		 */
+		errno = 0;
+		old_prio = getpriority(PRIO_PROCESS, 0);
+		ASSERT_OK(errno, "getpriority");
+		ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
+
 		/* notify parent signal handler is installed */
 		ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
 
@@ -58,6 +71,9 @@  static void test_send_signal_common(struct perf_event_attr *attr,
 		/* wait for parent notification and exit */
 		ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
 
+		/* restore the old priority */
+		ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
+
 		close(pipe_c2p[1]);
 		close(pipe_p2c[0]);
 		exit(0);
@@ -110,11 +126,16 @@  static void test_send_signal_common(struct perf_event_attr *attr,
 		goto disable_pmu;
 	}
 
-	ASSERT_EQ(buf[0], '2', "incorrect result");
-
 	/* notify child safe to exit */
 	ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
 
+	if (skel->bss->status == 0 && allow_skip) {
+		printf("%s:SKIP\n", __func__);
+		test__skip();
+	} else if (skel->bss->status != 1) {
+		ASSERT_EQ(buf[0], '2', "incorrect result");
+	}
+
 disable_pmu:
 	close(pmu_fd);
 destroy_skel:
@@ -127,7 +148,7 @@  static void test_send_signal_common(struct perf_event_attr *attr,
 
 static void test_send_signal_tracepoint(bool signal_thread)
 {
-	test_send_signal_common(NULL, signal_thread);
+	test_send_signal_common(NULL, signal_thread, false);
 }
 
 static void test_send_signal_perf(bool signal_thread)
@@ -138,7 +159,7 @@  static void test_send_signal_perf(bool signal_thread)
 		.config = PERF_COUNT_SW_CPU_CLOCK,
 	};
 
-	test_send_signal_common(&attr, signal_thread);
+	test_send_signal_common(&attr, signal_thread, true);
 }
 
 static void test_send_signal_nmi(bool signal_thread)
@@ -167,7 +188,7 @@  static void test_send_signal_nmi(bool signal_thread)
 		close(pmu_fd);
 	}
 
-	test_send_signal_common(&attr, signal_thread);
+	test_send_signal_common(&attr, signal_thread, true);
 }
 
 void test_send_signal(void)
diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
index b4233d3efac2..59c05c422bbd 100644
--- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
@@ -18,8 +18,7 @@  static __always_inline int bpf_send_signal_test(void *ctx)
 			ret = bpf_send_signal_thread(sig);
 		else
 			ret = bpf_send_signal(sig);
-		if (ret == 0)
-			status = 1;
+		status = (ret == 0) ? 1 : 2;
 	}
 
 	return 0;