
[v5,bpf-next,4/4] selftests/bpf: Add selftests for cpumask iter

Message ID 20240131145454.86990-5-laoar.shao@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Series: bpf: Add bpf_iter_cpumask

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success SINGLE THREAD; Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/build_tools success Errors and warnings before: 1 this patch: 0
netdev/cc_maintainers success CCed 0 of 0 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch warning
    CHECK: Macro argument 'meta' may be better as '(meta)' to avoid precedence issues
    WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
    WARNING: externs should be avoided in .c files
    WARNING: line length of 86 exceeds 80 columns
    WARNING: line length of 87 exceeds 80 columns
    WARNING: line length of 88 exceeds 80 columns
    WARNING: line length of 89 exceeds 80 columns
    WARNING: line length of 90 exceeds 80 columns
    WARNING: line length of 91 exceeds 80 columns
    WARNING: line length of 92 exceeds 80 columns
    WARNING: line length of 93 exceeds 80 columns
    WARNING: line length of 95 exceeds 80 columns
    WARNING: line length of 97 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-PR success PR summary

Commit Message

Yafang Shao Jan. 31, 2024, 2:54 p.m. UTC
Add selftests for the newly added cpumask iter.
- cpumask_iter_success
  - Iterating over the cpumask yields the expected number of CPUs
  - Percpu data extracted from the percpu struct matches the expected values
  - It works in both non-sleepable and sleepable progs
  - The RCU lock is only required by bpf_iter_cpumask_new() (see the usage
    sketch below this list)
  - It is fine to never call bpf_iter_cpumask_next()

- cpumask_iter_failure
  - The RCU lock is required in sleepable progs
  - The cpumask to be iterated over can't be NULL
  - bpf_iter_cpumask_destroy() is required after calling
    bpf_iter_cpumask_new()
  - bpf_iter_cpumask_destroy() can only destroy an initialized iter
  - bpf_iter_cpumask_next() must use an initialized iter
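
A minimal sketch of the usage contract these tests exercise (illustrative
only; p is assumed to be a valid task pointer, and the real programs are in
cpumask_iter_success.c and cpumask_iter_failure.c):

  struct bpf_iter_cpumask it;
  u32 nr_cpus = 0;
  int *cpu;

  bpf_rcu_read_lock();
  /* the mask must be non-NULL; only _new() strictly needs the RCU lock */
  bpf_iter_cpumask_new(&it, p->cpus_ptr);
  /* calling _next() is optional; skipping it entirely is fine */
  while ((cpu = bpf_iter_cpumask_next(&it)))
          nr_cpus++;
  /* _destroy() is mandatory once _new() has initialized the iter */
  bpf_iter_cpumask_destroy(&it);
  bpf_rcu_read_unlock();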

The test results are as follows:

  #64/37   cpumask/test_cpumask_iter:OK
  #64/38   cpumask/test_cpumask_iter_sleepable:OK
  #64/39   cpumask/test_cpumask_iter_sleepable:OK
  #64/40   cpumask/test_cpumask_iter_next_no_rcu:OK
  #64/41   cpumask/test_cpumask_iter_no_next:OK
  #64/42   cpumask/test_cpumask_iter:OK
  #64/43   cpumask/test_cpumask_iter_no_rcu:OK
  #64/44   cpumask/test_cpumask_iter_no_destroy:OK
  #64/45   cpumask/test_cpumask_iter_null_pointer:OK
  #64/46   cpumask/test_cpumask_iter_next_uninit:OK
  #64/47   cpumask/test_cpumask_iter_destroy_uninit:OK
  #64      cpumask:OK
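
The subtests above run under the standard BPF selftests runner; assuming the
usual selftests setup, they can be reproduced with:

  cd tools/testing/selftests/bpf
  make
  ./test_progs -t cpumask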

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 tools/testing/selftests/bpf/config            |   1 +
 .../selftests/bpf/prog_tests/cpumask.c        | 152 ++++++++++++++++++
 .../selftests/bpf/progs/cpumask_common.h      |   3 +
 .../bpf/progs/cpumask_iter_failure.c          |  99 ++++++++++++
 .../bpf/progs/cpumask_iter_success.c          | 126 +++++++++++++++
 5 files changed, 381 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/cpumask_iter_failure.c
 create mode 100644 tools/testing/selftests/bpf/progs/cpumask_iter_success.c

Comments

Andrii Nakryiko Feb. 2, 2024, 10:02 p.m. UTC | #1
On Wed, Jan 31, 2024 at 6:55 AM Yafang Shao <laoar.shao@gmail.com> wrote:
>
> Add selftests for the newly added cpumask iter.
> [...]
>   - bpf_iter_cpumask_destroy() can only destroy an initilialized iter
>   - bpf_iter_cpumask_next() must use an initilialized iter

typos: initialized

>
> [...]
>

LGTM overall, except for seemingly unnecessary use of a big macro

> diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h
> index 0cd4aebb97cf..cdb9dc95e9d9 100644
> --- a/tools/testing/selftests/bpf/progs/cpumask_common.h
> +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h
> @@ -55,6 +55,9 @@ void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym
>  u32 bpf_cpumask_any_distribute(const struct cpumask *src) __ksym;
>  u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, const struct cpumask *src2) __ksym;
>  u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
> +int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, const struct cpumask *mask) __ksym;
> +int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it) __ksym;
> +void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it) __ksym;

let's mark them __weak so they don't conflict with definitions that
will eventually come from vmlinux.h (that applies to all the kfunc
definitions we currently have and we'll need to clean all that up, but
let's not add non-weak kfuncs going forward)
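
For illustration, the three declarations above would then read (a sketch of
the requested change, matching the __ksym __weak style the quoted program
below already uses for its percpu externs):

  int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it,
                           const struct cpumask *mask) __ksym __weak;
  int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it) __ksym __weak;
  void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it) __ksym __weak;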

>
>  void bpf_rcu_read_lock(void) __ksym;
>  void bpf_rcu_read_unlock(void) __ksym;

[...]

> diff --git a/tools/testing/selftests/bpf/progs/cpumask_iter_success.c b/tools/testing/selftests/bpf/progs/cpumask_iter_success.c
> new file mode 100644
> index 000000000000..4ce14ef98451
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/cpumask_iter_success.c
> @@ -0,0 +1,126 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2024 Yafang Shao <laoar.shao@gmail.com> */
> +
> +#include "vmlinux.h"
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +#include "task_kfunc_common.h"
> +#include "cpumask_common.h"
> +
> +char _license[] SEC("license") = "GPL";
> +
> +extern const struct psi_group_cpu system_group_pcpu __ksym __weak;
> +extern const struct rq runqueues __ksym __weak;
> +
> +int pid;
> +
> +#define READ_PERCPU_DATA(meta, cgrp, mask)                                                     \
> +{                                                                                              \
> +       u32 nr_running = 0, psi_nr_running = 0, nr_cpus = 0;                                    \
> +       struct psi_group_cpu *groupc;                                                           \
> +       struct rq *rq;                                                                          \
> +       int *cpu;                                                                               \
> +                                                                                               \
> +       bpf_for_each(cpumask, cpu, mask) {                                                      \
> +               rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, *cpu);                            \
> +               if (!rq) {                                                                      \
> +                       err += 1;                                                               \
> +                       continue;                                                               \
> +               }                                                                               \
> +               nr_running += rq->nr_running;                                                   \
> +               nr_cpus += 1;                                                                   \
> +                                                                                               \
> +               groupc = (struct psi_group_cpu *)bpf_per_cpu_ptr(&system_group_pcpu, *cpu);     \
> +               if (!groupc) {                                                                  \
> +                       err += 1;                                                               \
> +                       continue;                                                               \
> +               }                                                                               \
> +               psi_nr_running += groupc->tasks[NR_RUNNING];                                    \
> +       }                                                                                       \
> +       BPF_SEQ_PRINTF(meta->seq, "nr_running %u nr_cpus %u psi_running %u\n",                  \
> +                      nr_running, nr_cpus, psi_nr_running);                                    \
> +}
> +

Does this have to be a gigantic macro? Why can't it be just a function?

> +SEC("iter.s/cgroup")
> +int BPF_PROG(test_cpumask_iter_sleepable, struct bpf_iter_meta *meta, struct cgroup *cgrp)
> +{
> +       struct task_struct *p;
> +
> +       /* epilogue */
> +       if (!cgrp)
> +               return 0;
> +
> +       bpf_rcu_read_lock();
> +       p = bpf_task_from_pid(pid);
> +       if (!p) {
> +               bpf_rcu_read_unlock();
> +               return 1;
> +       }
> +
> +       READ_PERCPU_DATA(meta, cgrp, p->cpus_ptr);
> +       bpf_task_release(p);
> +       bpf_rcu_read_unlock();
> +       return 0;
> +}
> +

[...]
Yafang Shao Feb. 4, 2024, 3:30 a.m. UTC | #2
On Sat, Feb 3, 2024 at 6:03 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Wed, Jan 31, 2024 at 6:55 AM Yafang Shao <laoar.shao@gmail.com> wrote:
> >
> > Add selftests for the newly added cpumask iter.
> > [...]
> >   - bpf_iter_cpumask_destroy() can only destroy an initilialized iter
> >   - bpf_iter_cpumask_next() must use an initilialized iter
>
> typos: initialized

will fix it.

>
> > [...]
>
> LGTM overall, except for seemingly unnecessary use of a big macro
>
> > diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h
> > index 0cd4aebb97cf..cdb9dc95e9d9 100644
> > --- a/tools/testing/selftests/bpf/progs/cpumask_common.h
> > +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h
> > @@ -55,6 +55,9 @@ void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym
> >  u32 bpf_cpumask_any_distribute(const struct cpumask *src) __ksym;
> >  u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, const struct cpumask *src2) __ksym;
> >  u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
> > +int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, const struct cpumask *mask) __ksym;
> > +int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it) __ksym;
> > +void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it) __ksym;
>
> let's mark them __weak so they don't conflict with definitions that
> will eventually come from vmlinux.h (that applies to all the kfunc
> definitions we currently have and we'll need to clean all that up, but
> let's not add non-weak kfuncs going forward)

will change it.

>
> > [...]
>
> Does this have to be a gigantic macro? Why can't it be just a function?

It seems that the verifier can't track a function call made between
bpf_rcu_read_lock() and bpf_rcu_read_unlock(). In other words, if there's
a function call between them, verification will fail.
Below is the full verifier log if I define it as:
static inline void read_percpu_data(struct bpf_iter_meta *meta, struct
cgroup *cgrp, const cpumask_t *mask)

VERIFIER LOG:
=============
0: R1=ctx() R10=fp0
; int BPF_PROG(test_cpumask_iter_sleepable, struct bpf_iter_meta
*meta, struct cgroup *cgrp)
0: (b4) w7 = 0                        ; R7_w=0
; int BPF_PROG(test_cpumask_iter_sleepable, struct bpf_iter_meta
*meta, struct cgroup *cgrp)
1: (79) r2 = *(u64 *)(r1 +8)          ; R1=ctx()
R2_w=trusted_ptr_or_null_cgroup(id=1)
; if (!cgrp)
2: (15) if r2 == 0x0 goto pc+16       ; R2_w=trusted_ptr_cgroup()
; int BPF_PROG(test_cpumask_iter_sleepable, struct bpf_iter_meta
*meta, struct cgroup *cgrp)
3: (79) r6 = *(u64 *)(r1 +0)
func 'bpf_iter_cgroup' arg0 has btf_id 10966 type STRUCT 'bpf_iter_meta'
4: R1=ctx() R6_w=trusted_ptr_bpf_iter_meta()
; bpf_rcu_read_lock();
4: (85) call bpf_rcu_read_lock#84184          ;
; p = bpf_task_from_pid(pid);
5: (18) r1 = 0xffffbc1ad3f72004       ;
R1_w=map_value(map=cpumask_.bss,ks=4,vs=8,off=4)
7: (61) r1 = *(u32 *)(r1 +0)          ;
R1_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
; p = bpf_task_from_pid(pid);
8: (85) call bpf_task_from_pid#84204          ;
R0=ptr_or_null_task_struct(id=3,ref_obj_id=3) refs=3
9: (bf) r8 = r0                       ;
R0=ptr_or_null_task_struct(id=3,ref_obj_id=3)
R8_w=ptr_or_null_task_struct(id=3,ref_obj_id=3) refs=3
10: (b4) w7 = 1                       ; R7_w=1 refs=3
; if (!p) {
11: (15) if r8 == 0x0 goto pc+6       ;
R8_w=ptr_task_struct(ref_obj_id=3) refs=3
; read_percpu_data(meta, cgrp, p->cpus_ptr);
12: (79) r2 = *(u64 *)(r8 +984)       ; R2_w=rcu_ptr_cpumask()
R8_w=ptr_task_struct(ref_obj_id=3) refs=3
; read_percpu_data(meta, cgrp, p->cpus_ptr);
13: (bf) r1 = r6                      ;
R1_w=trusted_ptr_bpf_iter_meta() R6=trusted_ptr_bpf_iter_meta() refs=3
14: (85) call pc+6
caller:
 R6=trusted_ptr_bpf_iter_meta() R7_w=1
R8_w=ptr_task_struct(ref_obj_id=3) R10=fp0 refs=3
callee:
 frame1: R1_w=trusted_ptr_bpf_iter_meta() R2_w=rcu_ptr_cpumask() R10=fp0 refs=3
21: frame1: R1_w=trusted_ptr_bpf_iter_meta() R2_w=rcu_ptr_cpumask()
R10=fp0 refs=3
; static inline void read_percpu_data(struct bpf_iter_meta *meta,
struct cgroup *cgrp, const cpumask_t *mask)
21: (bf) r8 = r1                      ; frame1:
R1_w=trusted_ptr_bpf_iter_meta() R8_w=trusted_ptr_bpf_iter_meta()
refs=3
22: (bf) r7 = r10                     ; frame1: R7_w=fp0 R10=fp0 refs=3
;
23: (07) r7 += -24                    ; frame1: R7_w=fp-24 refs=3
; bpf_for_each(cpumask, cpu, mask) {
24: (bf) r1 = r7                      ; frame1: R1_w=fp-24 R7_w=fp-24 refs=3
25: (85) call bpf_iter_cpumask_new#77163      ; frame1: R0=scalar()
fp-24=iter_cpumask(ref_id=4,state=active,depth=0) refs=3,4
; bpf_for_each(cpumask, cpu, mask) {
26: (bf) r1 = r7                      ; frame1: R1=fp-24 R7=fp-24 refs=3,4
27: (85) call bpf_iter_cpumask_next#77165     ; frame1: R0_w=0
fp-24=iter_cpumask(ref_id=4,state=drained,depth=0) refs=3,4
28: (bf) r7 = r0                      ; frame1: R0_w=0 R7_w=0 refs=3,4
29: (b4) w1 = 0                       ; frame1: R1_w=0 refs=3,4
30: (63) *(u32 *)(r10 -40) = r1       ; frame1: R1_w=0 R10=fp0
fp-40=????0 refs=3,4
31: (b4) w1 = 0                       ; frame1: R1_w=0 refs=3,4
32: (7b) *(u64 *)(r10 -32) = r1       ; frame1: R1_w=0 R10=fp0
fp-32_w=0 refs=3,4
33: (b4) w9 = 0                       ; frame1: R9_w=0 refs=3,4
; bpf_for_each(cpumask, cpu, mask) {
34: (15) if r7 == 0x0 goto pc+57      ; frame1: R7_w=0 refs=3,4
; bpf_for_each(cpumask, cpu, mask) {
92: (bf) r1 = r10                     ; frame1: R1_w=fp0 R10=fp0 refs=3,4
; bpf_for_each(cpumask, cpu, mask) {
93: (07) r1 += -24                    ; frame1: R1_w=fp-24 refs=3,4
94: (85) call bpf_iter_cpumask_destroy#77161          ; frame1: refs=3
; BPF_SEQ_PRINTF(meta->seq, "nr_running %u nr_cpus %u psi_running %u\n",
95: (61) r1 = *(u32 *)(r10 -40)       ; frame1: R1_w=0 R10=fp0
fp-40=????0 refs=3
96: (bc) w1 = w1                      ; frame1: R1_w=0 refs=3
97: (7b) *(u64 *)(r10 -8) = r1        ; frame1: R1_w=0 R10=fp0 fp-8_w=0 refs=3
98: (79) r1 = *(u64 *)(r10 -32)       ; frame1: R1_w=0 R10=fp0 fp-32=0 refs=3
99: (7b) *(u64 *)(r10 -16) = r1       ; frame1: R1_w=0 R10=fp0 fp-16_w=0 refs=3
100: (7b) *(u64 *)(r10 -24) = r9      ; frame1: R9=0 R10=fp0 fp-24_w=0 refs=3
101: (79) r1 = *(u64 *)(r8 +0)        ; frame1:
R1_w=trusted_ptr_seq_file() R8=trusted_ptr_bpf_iter_meta() refs=3
102: (bf) r4 = r10                    ; frame1: R4_w=fp0 R10=fp0 refs=3
; bpf_for_each(cpumask, cpu, mask) {
103: (07) r4 += -24                   ; frame1: R4_w=fp-24 refs=3
; BPF_SEQ_PRINTF(meta->seq, "nr_running %u nr_cpus %u psi_running %u\n",
104: (18) r2 = 0xffff9bce47e0e210     ; frame1:
R2_w=map_value(map=cpumask_.rodata,ks=4,vs=41) refs=3
106: (b4) w3 = 41                     ; frame1: R3_w=41 refs=3
107: (b4) w5 = 24                     ; frame1: R5_w=24 refs=3
108: (85) call bpf_seq_printf#126     ; frame1: R0=scalar() refs=3
; }
109: (95) exit
bpf_rcu_read_unlock is missing
processed 45 insns (limit 1000000) max_states_per_insn 0 total_states
5 peak_states 5 mark_read 3
=============


Another workaround is to use __always_inline:
static __always_inline void read_percpu_data(struct bpf_iter_meta
*meta, struct cgroup *cgrp, const cpumask_t *mask)
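
For illustration, a sketch of that variant (same logic as the
READ_PERCPU_DATA macro quoted upthread; err is assumed to be the global
error counter from cpumask_common.h, and cgrp is kept only to mirror the
macro's signature):

  static __always_inline void read_percpu_data(struct bpf_iter_meta *meta,
                                               struct cgroup *cgrp,
                                               const cpumask_t *mask)
  {
          u32 nr_running = 0, psi_nr_running = 0, nr_cpus = 0;
          struct psi_group_cpu *groupc;
          struct rq *rq;
          int *cpu;

          bpf_for_each(cpumask, cpu, mask) {
                  /* per-CPU runqueue stats */
                  rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, *cpu);
                  if (!rq) {
                          err += 1;
                          continue;
                  }
                  nr_running += rq->nr_running;
                  nr_cpus += 1;

                  /* per-CPU PSI stats */
                  groupc = (struct psi_group_cpu *)bpf_per_cpu_ptr(&system_group_pcpu, *cpu);
                  if (!groupc) {
                          err += 1;
                          continue;
                  }
                  psi_nr_running += groupc->tasks[NR_RUNNING];
          }
          BPF_SEQ_PRINTF(meta->seq, "nr_running %u nr_cpus %u psi_running %u\n",
                         nr_running, nr_cpus, psi_nr_running);
  }

Because __always_inline forces the body back into the calling prog, the
verifier never sees a separate subprog call inside the RCU critical section,
which is why this variant verifies while the plain static function does not.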

> [...]
Yonghong Song Feb. 4, 2024, 5:09 p.m. UTC | #3
On 2/3/24 7:30 PM, Yafang Shao wrote:
> On Sat, Feb 3, 2024 at 6:03 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
>> On Wed, Jan 31, 2024 at 6:55 AM Yafang Shao <laoar.shao@gmail.com> wrote:
>>> Add selftests for the newly added cpumask iter.
>>
>> [...]
>>
>> Does this have to be a gigantic macro? Why can't it be just a function?
> It seems that the verifier can't identify a function call between
> bpf_rcu_read_lock() and bpf_rcu_read_unlock().
> That said, if there's a function call between them, the verifier will fail.
> Below is the full verifier log if I define it as :
> static inline void read_percpu_data(struct bpf_iter_meta *meta, struct
> cgroup *cgrp, const cpumask_t *mask)
>
> VERIFIER LOG:
> =============
> [...]
> 109: (95) exit
> bpf_rcu_read_unlock is missing
> processed 45 insns (limit 1000000) max_states_per_insn 0 total_states
> 5 peak_states 5 mark_read 3
> =============

The error is due to the following check in the verifier:

                         } else if (opcode == BPF_EXIT) {
				...
                                 if (env->cur_state->active_rcu_lock &&
                                     !in_rbtree_lock_required_cb(env)) {
                                         verbose(env, "bpf_rcu_read_unlock is missing\n");
                                         return -EINVAL;
                                 }


I guess we could relax the condition so that it does not return -EINVAL
when exiting a static function.
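
For illustration, one hypothetical shape of that relaxation (a sketch only,
not the actual fix; it assumes curframe can be used to tell a static
subprog's exit apart from the main prog's exit):

                         } else if (opcode == BPF_EXIT) {
                                ...
                                 /* only enforce at the main prog's exit; a
                                  * static subprog (curframe > 0) may return
                                  * while its caller still holds the RCU
                                  * read lock
                                  */
                                 if (env->cur_state->active_rcu_lock &&
                                     !in_rbtree_lock_required_cb(env) &&
                                     !env->cur_state->curframe) {
                                         verbose(env, "bpf_rcu_read_unlock is missing\n");
                                         return -EINVAL;
                                 }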

>
>
> Another workaround is using the __always_inline :
> static __always_inline void read_percpu_data(struct bpf_iter_meta
> *meta, struct cgroup *cgrp, const cpumask_t *mask)

__always_inline also works. But let us improve the verifier so we
can avoid such workarounds in the future. Note that Kumar just
submitted a patch set to relax the spin_lock checks for static functions:
   https://lore.kernel.org/bpf/20240204120206.796412-1-memxor@gmail.com/

> [...]
Yafang Shao Feb. 5, 2024, 2:49 a.m. UTC | #4
On Mon, Feb 5, 2024 at 1:09 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> On 2/3/24 7:30 PM, Yafang Shao wrote:
> > On Sat, Feb 3, 2024 at 6:03 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> >> On Wed, Jan 31, 2024 at 6:55 AM Yafang Shao <laoar.shao@gmail.com> wrote:
> >>> Add selftests for the newly added cpumask iter.
> >>> [...]
> >>
> >> Does this have to be a gigantic macro? Why can't it be just a function?
> > It seems that the verifier can't identify a function call between
> > bpf_rcu_read_lock() and bpf_rcu_read_unlock().
> > That said, if there's a function call between them, the verifier will fail.
> >
> > [...]
> > 109: (95) exit
> > bpf_rcu_read_unlock is missing
> > processed 45 insns (limit 1000000) max_states_per_insn 0 total_states
> > 5 peak_states 5 mark_read 3
> > =============
>
> The error is due to the following in verifier:
>
>                          } else if (opcode == BPF_EXIT) {
>                                 ...
>                                  if (env->cur_state->active_rcu_lock &&
>                                      !in_rbtree_lock_required_cb(env)) {
>                                          verbose(env, "bpf_rcu_read_unlock is missing\n");
>                                          return -EINVAL;
>                                  }
>
>
> I guess we could relax the condition so it does not return -EINVAL if
> it is a static function.

Thanks for your suggestion.
It seems Kumar has already submitted a fix for it:
https://lore.kernel.org/bpf/20240204230231.1013964-1-memxor@gmail.com/
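
For illustration only, such a relaxation could look roughly like the sketch
below in the verifier's BPF_EXIT handling. This is just the idea, not Kumar's
actual patch; the curframe test is an assumption about one way to exempt
static subprog exits:

	} else if (opcode == BPF_EXIT) {
		...
		/* Hypothetical relaxation (sketch only): when exiting a
		 * subprog (curframe > 0), the RCU read lock may still be
		 * legitimately held by the caller, so enforce the check
		 * only at the main program's exit. A lock that is never
		 * released is still caught there, since active_rcu_lock
		 * stays set until the final exit.
		 */
		if (env->cur_state->active_rcu_lock &&
		    !in_rbtree_lock_required_cb(env) &&
		    env->cur_state->curframe == 0) {
			verbose(env, "bpf_rcu_read_unlock is missing\n");
			return -EINVAL;
		}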

>
> >
> >
> > Another workaround is using __always_inline:
> > static __always_inline void read_percpu_data(struct bpf_iter_meta
> > *meta, struct cgroup *cgrp, const cpumask_t *mask)
>
> __always_inline also works. But let us improve the verifier so we
> can avoid such workarounds in the future. Note that Kumar just
> submitted a patch set to relax spin_lock for static functions:
>    https://lore.kernel.org/bpf/20240204120206.796412-1-memxor@gmail.com/
>
> >
> >>> +SEC("iter.s/cgroup")
> >>> +int BPF_PROG(test_cpumask_iter_sleepable, struct bpf_iter_meta *meta, struct cgroup *cgrp)
> >>> +{
> >>> +       struct task_struct *p;
> >>> +
> >>> +       /* epilogue */
> >>> +       if (!cgrp)
> >>> +               return 0;
> >>> +
> >>> +       bpf_rcu_read_lock();
> >>> +       p = bpf_task_from_pid(pid);
> >>> +       if (!p) {
> >>> +               bpf_rcu_read_unlock();
> >>> +               return 1;
> >>> +       }
> >>> +
> >>> +       READ_PERCPU_DATA(meta, cgrp, p->cpus_ptr);
> >>> +       bpf_task_release(p);
> >>> +       bpf_rcu_read_unlock();
> >>> +       return 0;
> >>> +}
> >>> +
> >> [...]
> >
> >
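
For reference, a minimal sketch of the __always_inline workaround discussed
above, assuming the same body as the READ_PERCPU_DATA macro from this patch
(err is the error counter that macro already uses):

static __always_inline void read_percpu_data(struct bpf_iter_meta *meta, struct cgroup *cgrp,
					     const struct cpumask *mask)
{
	u32 nr_running = 0, psi_nr_running = 0, nr_cpus = 0;
	struct psi_group_cpu *groupc;
	struct rq *rq;
	int *cpu;

	/* Forced inlining keeps this loop in the caller's frame, so the
	 * verifier still sees it between bpf_rcu_read_lock() and
	 * bpf_rcu_read_unlock() and no longer reports a missing unlock.
	 */
	bpf_for_each(cpumask, cpu, mask) {
		rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, *cpu);
		if (!rq) {
			err += 1;
			continue;
		}
		nr_running += rq->nr_running;
		nr_cpus += 1;

		groupc = (struct psi_group_cpu *)bpf_per_cpu_ptr(&system_group_pcpu, *cpu);
		if (!groupc) {
			err += 1;
			continue;
		}
		psi_nr_running += groupc->tasks[NR_RUNNING];
	}
	BPF_SEQ_PRINTF(meta->seq, "nr_running %u nr_cpus %u psi_running %u\n",
		       nr_running, nr_cpus, psi_nr_running);
}
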
kernel test robot Feb. 5, 2024, 3:08 a.m. UTC | #5
Hi Yafang,

kernel test robot noticed the following build errors:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Yafang-Shao/bpf-Add-bpf_iter_cpumask-kfuncs/20240131-232406
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20240131145454.86990-5-laoar.shao%40gmail.com
patch subject: [PATCH v5 bpf-next 4/4] selftests/bpf: Add selftests for cpumask iter
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240205/202402051121.y4w06atm-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202402051121.y4w06atm-lkp@intel.com/

All errors (new ones prefixed by >>):

>> progs/cpumask_iter_success.c:61:2: error: incomplete definition of type 'struct psi_group_cpu'
      61 |         READ_PERCPU_DATA(meta, cgrp, p->cpus_ptr);
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   progs/cpumask_iter_success.c:39:27: note: expanded from macro 'READ_PERCPU_DATA'
      39 |                 psi_nr_running += groupc->tasks[NR_RUNNING];                                    \
         |                                   ~~~~~~^
   progs/cpumask_iter_success.c:13:21: note: forward declaration of 'struct psi_group_cpu'
      13 | extern const struct psi_group_cpu system_group_pcpu __ksym __weak;
         |                     ^
>> progs/cpumask_iter_success.c:61:2: error: use of undeclared identifier 'NR_RUNNING'; did you mean 'T_RUNNING'?
      61 |         READ_PERCPU_DATA(meta, cgrp, p->cpus_ptr);
         |         ^
   progs/cpumask_iter_success.c:39:35: note: expanded from macro 'READ_PERCPU_DATA'
      39 |                 psi_nr_running += groupc->tasks[NR_RUNNING];                                    \
         |                                                 ^
   /tools/include/vmlinux.h:28263:3: note: 'T_RUNNING' declared here
    28263 |                 T_RUNNING = 0,
          |                 ^
   progs/cpumask_iter_success.c:80:2: error: incomplete definition of type 'struct psi_group_cpu'
      80 |         READ_PERCPU_DATA(meta, cgrp, p->cpus_ptr);
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   progs/cpumask_iter_success.c:39:27: note: expanded from macro 'READ_PERCPU_DATA'
      39 |                 psi_nr_running += groupc->tasks[NR_RUNNING];                                    \
         |                                   ~~~~~~^
   progs/cpumask_iter_success.c:13:21: note: forward declaration of 'struct psi_group_cpu'
      13 | extern const struct psi_group_cpu system_group_pcpu __ksym __weak;
         |                     ^
   progs/cpumask_iter_success.c:80:2: error: use of undeclared identifier 'NR_RUNNING'; did you mean 'T_RUNNING'?
      80 |         READ_PERCPU_DATA(meta, cgrp, p->cpus_ptr);
         |         ^
   progs/cpumask_iter_success.c:39:35: note: expanded from macro 'READ_PERCPU_DATA'
      39 |                 psi_nr_running += groupc->tasks[NR_RUNNING];                                    \
         |                                                 ^
   /tools/include/vmlinux.h:28263:3: note: 'T_RUNNING' declared here
    28263 |                 T_RUNNING = 0,
          |                 ^
   4 errors generated.
Yafang Shao Feb. 5, 2024, 1:14 p.m. UTC | #6
On Mon, Feb 5, 2024 at 11:09 AM kernel test robot <lkp@intel.com> wrote:
> [...]

Thanks for your report.
It seems the issue is caused by a missing CONFIG_PSI=y in the robot's build.
The kernel config should align with tools/testing/selftests/bpf/config, which
this patch updates with that option.

> [...]
Andrii Nakryiko Feb. 5, 2024, 6:25 p.m. UTC | #7
On Sun, Feb 4, 2024 at 9:09 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> On 2/3/24 7:30 PM, Yafang Shao wrote:
> > [...]
>
> The error is due to the following check in the verifier:
>
>                          } else if (opcode == BPF_EXIT) {
>                                 ...
>                                  if (env->cur_state->active_rcu_lock &&
>                                      !in_rbtree_lock_required_cb(env)) {
>                                          verbose(env, "bpf_rcu_read_unlock is missing\n");
>                                          return -EINVAL;
>                                  }
>
>
> I guess we could relax the condition so it does not return -EINVAL if
> it is a static function.
>
> >
> >
> > Another workaround is using __always_inline:
> > static __always_inline void read_percpu_data(struct bpf_iter_meta
> > *meta, struct cgroup *cgrp, const cpumask_t *mask)
>
> __always_inline also works. But let us improve the verifier so we
> can avoid such workarounds in the future. Note that Kumar just
> submitted a patch set to relax spin_lock for static functions:
>    https://lore.kernel.org/bpf/20240204120206.796412-1-memxor@gmail.com/


Agreed, let's improve the verifier, but I wouldn't block this patch set
on it; just use __always_inline for now.

I think we should also work on extending this RCU support to global
functions. We can add a per-function annotation (similar to __arg_xxx,
but applied to the function itself), just __rcu or
something like __func_assume_rcu or whatnot, and then there should be
no difference between static and global functions in this regard.

In general, global functions are basically mandatory nowadays for big
production BPF programs (to reduce verification complexity), so we
should strive to keep global functions/subprogs on par with static
subprogs as much as we can.
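
Purely as a sketch of that idea: the tag name and decl-tag string below are
hypothetical placeholders, not an existing API; only the __arg_xxx-style BTF
decl-tag mechanism they mimic exists today.

/* Hypothetical per-function annotation, modeled on how the existing
 * __arg_xxx argument tags are implemented via BTF decl tags.
 */
#define __func_assume_rcu __attribute__((btf_decl_tag("func:assume_rcu")))

/* A global subprog the verifier could then treat as running under
 * bpf_rcu_read_lock(), on par with a static subprog. */
__weak void read_percpu_data(struct bpf_iter_meta *meta, struct cgroup *cgrp,
			     const struct cpumask *mask) __func_assume_rcu;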

> [...]
Yafang Shao Feb. 6, 2024, 7:38 a.m. UTC | #8
On Tue, Feb 6, 2024 at 2:25 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Sun, Feb 4, 2024 at 9:09 AM Yonghong Song <yonghong.song@linux.dev> wrote:
> > [...]
>
>
> Agreed, let's improve the verifier, but I wouldn't block this patch set
> on it; just use __always_inline for now.

OK. will do it.

>
> I think we should also work on extending this RCU support to global
> functions. We can add a per-function annotation (similar to __arg_xxx,
> but applied to the function itself), just __rcu or
> something like __func_assume_rcu or whatnot, and then there should be
> no difference between static and global functions in this regard.
>
> In general, global functions are basically mandatory nowadays for big
> production BPF programs (to reduce verification complexity), so we
> should strive to keep global functions/subprogs on par with static
> subprogs as much as we can.

Understood. Thanks for your explanation.
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 01f241ea2c67..dd4b0935e35f 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -78,6 +78,7 @@  CONFIG_NF_CONNTRACK_MARK=y
 CONFIG_NF_DEFRAG_IPV4=y
 CONFIG_NF_DEFRAG_IPV6=y
 CONFIG_NF_NAT=y
+CONFIG_PSI=y
 CONFIG_RC_CORE=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask.c b/tools/testing/selftests/bpf/prog_tests/cpumask.c
index ecf89df78109..78423745689c 100644
--- a/tools/testing/selftests/bpf/prog_tests/cpumask.c
+++ b/tools/testing/selftests/bpf/prog_tests/cpumask.c
@@ -1,9 +1,15 @@ 
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
 
+#define _GNU_SOURCE
+#include <sched.h>
+
 #include <test_progs.h>
 #include "cpumask_failure.skel.h"
 #include "cpumask_success.skel.h"
+#include "cpumask_iter_success.skel.h"
+#include "cpumask_iter_failure.skel.h"
+#include "cgroup_helpers.h"
 
 static const char * const cpumask_success_testcases[] = {
 	"test_alloc_free_cpumask",
@@ -61,6 +67,142 @@  static void verify_success(const char *prog_name)
 	cpumask_success__destroy(skel);
 }
 
+static const char * const cpumask_iter_success_testcases[] = {
+	"test_cpumask_iter",
+	"test_cpumask_iter_sleepable",
+};
+
+static int read_percpu_data(struct bpf_link *link, int nr_cpu_exp, int nr_running_exp)
+{
+	int iter_fd, len, item, nr_running, psi_running, nr_cpus, err = -1;
+	char buf[128];
+	size_t left;
+	char *p;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (!ASSERT_GE(iter_fd, 0, "iter_fd"))
+		return -1;
+
+	memset(buf, 0, sizeof(buf));
+	left = ARRAY_SIZE(buf);
+	p = buf;
+	while ((len = read(iter_fd, p, left)) > 0) {
+		p += len;
+		left -= len;
+	}
+
+	item = sscanf(buf, "nr_running %u nr_cpus %u psi_running %u\n",
+		      &nr_running, &nr_cpus, &psi_running);
+	if (!ASSERT_EQ(item, 3, "seq_format"))
+		goto out;
+	if (!ASSERT_EQ(nr_cpus, nr_cpu_exp, "nr_cpus"))
+		goto out;
+	if (!ASSERT_GE(nr_running, nr_running_exp, "nr_running"))
+		goto out;
+	if (!ASSERT_GE(psi_running, nr_running_exp, "psi_running"))
+		goto out;
+
+	err = 0;
+out:
+	close(iter_fd);
+	return err;
+}
+
+static void verify_iter_success(const char *prog_name)
+{
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	int cgrp_fd, nr_cpus, err, i, chosen = 0;
+	struct cpumask_iter_success *skel;
+	union bpf_iter_link_info linfo;
+	struct bpf_program *prog;
+	struct bpf_link *link;
+	cpu_set_t set;
+
+	if (setup_cgroup_environment())
+		return;
+
+	/* Utilize the cgroup iter */
+	cgrp_fd = get_root_cgroup();
+	if (!ASSERT_GE(cgrp_fd, 0, "create_cgrp"))
+		goto cleanup;
+
+	skel = cpumask_iter_success__open();
+	if (!ASSERT_OK_PTR(skel, "cpumask_iter_success__open"))
+		goto close_fd;
+
+	skel->bss->pid = getpid();
+	nr_cpus = libbpf_num_possible_cpus();
+
+	err = cpumask_iter_success__load(skel);
+	if (!ASSERT_OK(err, "cpumask_iter_success__load"))
+		goto destroy;
+
+	prog = bpf_object__find_program_by_name(skel->obj, prog_name);
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto destroy;
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.cgroup.cgroup_fd = cgrp_fd;
+	linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+	link = bpf_program__attach_iter(prog, &opts);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
+		goto destroy;
+
+	/* Case 1): Enable all possible CPUs */
+	CPU_ZERO(&set);
+	for (i = 0; i < nr_cpus; i++)
+		CPU_SET(i, &set);
+	err = sched_setaffinity(skel->bss->pid, sizeof(set), &set);
+	if (!ASSERT_OK(err, "setaffinity_all_cpus"))
+		goto free_link;
+	err = read_percpu_data(link, nr_cpus, 1);
+	if (!ASSERT_OK(err, "read_percpu_data"))
+		goto free_link;
+	if (!ASSERT_OK(skel->bss->err, "null_rq"))
+		goto free_link;
+
+	/* Case 2): CPU0 only */
+	CPU_ZERO(&set);
+	CPU_SET(0, &set);
+	err = sched_setaffinity(skel->bss->pid, sizeof(set), &set);
+	if (!ASSERT_OK(err, "setaffinity_cpu0"))
+		goto free_link;
+	err = read_percpu_data(link, 1, 1);
+	if (!ASSERT_OK(err, "read_percpu_data"))
+		goto free_link;
+	if (!ASSERT_OK(skel->bss->err, "null_rq_psi"))
+		goto free_link;
+
+	/* Case 3): Partial CPUs */
+	CPU_ZERO(&set);
+	for (i = 0; i < nr_cpus; i++) {
+		if (i < 4 && i & 0x1)
+			continue;
+		if (i > 8 && i & 0x2)
+			continue;
+		CPU_SET(i, &set);
+		chosen++;
+	}
+	err = sched_setaffinity(skel->bss->pid, sizeof(set), &set);
+	if (!ASSERT_OK(err, "setaffinity_partial_cpus"))
+		goto free_link;
+	err = read_percpu_data(link, chosen, 1);
+	if (!ASSERT_OK(err, "read_percpu_data"))
+		goto free_link;
+	ASSERT_OK(skel->bss->err, "null_rq_psi");
+
+free_link:
+	bpf_link__destroy(link);
+destroy:
+	cpumask_iter_success__destroy(skel);
+close_fd:
+	close(cgrp_fd);
+cleanup:
+	cleanup_cgroup_environment();
+}
+
 void test_cpumask(void)
 {
 	int i;
@@ -74,4 +216,14 @@  void test_cpumask(void)
 
 	RUN_TESTS(cpumask_success);
 	RUN_TESTS(cpumask_failure);
+
+	for (i = 0; i < ARRAY_SIZE(cpumask_iter_success_testcases); i++) {
+		if (!test__start_subtest(cpumask_iter_success_testcases[i]))
+			continue;
+
+		verify_iter_success(cpumask_iter_success_testcases[i]);
+	}
+
+	RUN_TESTS(cpumask_iter_success);
+	RUN_TESTS(cpumask_iter_failure);
 }
diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h
index 0cd4aebb97cf..cdb9dc95e9d9 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_common.h
+++ b/tools/testing/selftests/bpf/progs/cpumask_common.h
@@ -55,6 +55,9 @@  void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym
 u32 bpf_cpumask_any_distribute(const struct cpumask *src) __ksym;
 u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, const struct cpumask *src2) __ksym;
 u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
+int bpf_iter_cpumask_new(struct bpf_iter_cpumask *it, const struct cpumask *mask) __ksym;
+int *bpf_iter_cpumask_next(struct bpf_iter_cpumask *it) __ksym;
+void bpf_iter_cpumask_destroy(struct bpf_iter_cpumask *it) __ksym;
 
 void bpf_rcu_read_lock(void) __ksym;
 void bpf_rcu_read_unlock(void) __ksym;
diff --git a/tools/testing/selftests/bpf/progs/cpumask_iter_failure.c b/tools/testing/selftests/bpf/progs/cpumask_iter_failure.c
new file mode 100644
index 000000000000..3d304cee0a72
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cpumask_iter_failure.c
@@ -0,0 +1,99 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Yafang Shao <laoar.shao@gmail.com> */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "bpf_misc.h"
+#include "task_kfunc_common.h"
+#include "cpumask_common.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("iter.s/cgroup")
+__failure __msg("R2 must be a rcu pointer")
+int BPF_PROG(test_cpumask_iter_no_rcu, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+	struct task_struct *p;
+	int *cpu;
+
+	p = bpf_task_from_pid(1);
+	if (!p)
+		return 1;
+
+	bpf_for_each(cpumask, cpu, p->cpus_ptr) {
+	}
+	bpf_task_release(p);
+	return 0;
+}
+
+SEC("iter/cgroup")
+__failure __msg("Possibly NULL pointer passed to trusted arg1")
+int BPF_PROG(test_cpumask_iter_null_pointer, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+	struct cpumask *mask = NULL;
+	int *cpu;
+
+	bpf_for_each(cpumask, cpu, mask) {
+	}
+	return 0;
+}
+
+SEC("iter.s/cgroup")
+__failure __msg("Unreleased reference id=3 alloc_insn=10")
+int BPF_PROG(test_cpumask_iter_no_destroy, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+	struct bpf_iter_cpumask it;
+	struct task_struct *p;
+
+	p = bpf_task_from_pid(1);
+	if (!p)
+		return 1;
+
+	bpf_rcu_read_lock();
+	bpf_iter_cpumask_new(&it, p->cpus_ptr);
+	bpf_rcu_read_unlock();
+
+	bpf_iter_cpumask_next(&it);
+	bpf_task_release(p);
+	return 0;
+}
+
+SEC("iter/cgroup")
+__failure __msg("expected an initialized iter_cpumask as arg #1")
+int BPF_PROG(test_cpumask_iter_next_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+	struct bpf_iter_cpumask *it = NULL;
+
+	bpf_iter_cpumask_next(it);
+	return 0;
+}
+
+SEC("iter/cgroup")
+__failure __msg("expected an initialized iter_cpumask as arg #1")
+int BPF_PROG(test_cpumask_iter_next_uninit2, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+	struct bpf_iter_cpumask it = {};
+
+	bpf_iter_cpumask_next(&it);
+	return 0;
+}
+
+SEC("iter/cgroup")
+__failure __msg("expected an initialized iter_cpumask as arg #1")
+int BPF_PROG(test_cpumask_iter_destroy_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+	struct bpf_iter_cpumask_kern it = {.cpu = -1};
+	struct bpf_cpumask *mask;
+
+	mask = bpf_cpumask_create();
+	if (!mask)
+		return 1;
+
+	bpf_cpumask_setall(mask);
+	it.mask = &mask->cpumask;
+	bpf_iter_cpumask_destroy((struct bpf_iter_cpumask *)&it);
+	bpf_cpumask_release(mask);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/cpumask_iter_success.c b/tools/testing/selftests/bpf/progs/cpumask_iter_success.c
new file mode 100644
index 000000000000..4ce14ef98451
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cpumask_iter_success.c
@@ -0,0 +1,126 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Yafang Shao <laoar.shao@gmail.com> */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "task_kfunc_common.h"
+#include "cpumask_common.h"
+
+char _license[] SEC("license") = "GPL";
+
+extern const struct psi_group_cpu system_group_pcpu __ksym __weak;
+extern const struct rq runqueues __ksym __weak;
+
+int pid;
+
+#define READ_PERCPU_DATA(meta, cgrp, mask)							\
+{												\
+	u32 nr_running = 0, psi_nr_running = 0, nr_cpus = 0;					\
+	struct psi_group_cpu *groupc;								\
+	struct rq *rq;										\
+	int *cpu;										\
+												\
+	bpf_for_each(cpumask, cpu, mask) {							\
+		rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, *cpu);				\
+		if (!rq) {									\
+			err += 1;								\
+			continue;								\
+		}										\
+		nr_running += rq->nr_running;							\
+		nr_cpus += 1;									\
+												\
+		groupc = (struct psi_group_cpu *)bpf_per_cpu_ptr(&system_group_pcpu, *cpu);	\
+		if (!groupc) {									\
+			err += 1;								\
+			continue;								\
+		}										\
+		psi_nr_running += groupc->tasks[NR_RUNNING];					\
+	}											\
+	BPF_SEQ_PRINTF(meta->seq, "nr_running %u nr_cpus %u psi_running %u\n",			\
+		       nr_running, nr_cpus, psi_nr_running);					\
+}
+
+SEC("iter.s/cgroup")
+int BPF_PROG(test_cpumask_iter_sleepable, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+	struct task_struct *p;
+
+	/* epilogue */
+	if (!cgrp)
+		return 0;
+
+	bpf_rcu_read_lock();
+	p = bpf_task_from_pid(pid);
+	if (!p) {
+		bpf_rcu_read_unlock();
+		return 1;
+	}
+
+	READ_PERCPU_DATA(meta, cgrp, p->cpus_ptr);
+	bpf_task_release(p);
+	bpf_rcu_read_unlock();
+	return 0;
+}
+
+SEC("iter/cgroup")
+int BPF_PROG(test_cpumask_iter, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+	struct task_struct *p;
+
+	/* epilogue */
+	if (!cgrp)
+		return 0;
+
+	p = bpf_task_from_pid(pid);
+	if (!p)
+		return 1;
+
+	READ_PERCPU_DATA(meta, cgrp, p->cpus_ptr);
+	bpf_task_release(p);
+	return 0;
+}
+
+SEC("iter.s/cgroup")
+int BPF_PROG(test_cpumask_iter_next_no_rcu, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+	struct bpf_iter_cpumask it;
+	struct task_struct *p;
+
+	p = bpf_task_from_pid(1);
+	if (!p)
+		return 1;
+
+	/* RCU is only required by bpf_iter_cpumask_new(). */
+	bpf_rcu_read_lock();
+	bpf_iter_cpumask_new(&it, p->cpus_ptr);
+	bpf_rcu_read_unlock();
+
+	bpf_iter_cpumask_next(&it);
+	bpf_iter_cpumask_destroy(&it);
+
+	bpf_task_release(p);
+	return 0;
+}
+
+SEC("iter.s/cgroup")
+int BPF_PROG(test_cpumask_iter_no_next, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+{
+	struct bpf_iter_cpumask it;
+	struct task_struct *p;
+
+	p = bpf_task_from_pid(1);
+	if (!p)
+		return 1;
+
+	bpf_rcu_read_lock();
+	bpf_iter_cpumask_new(&it, p->cpus_ptr);
+	bpf_rcu_read_unlock();
+
+	/* It is fine without calling bpf_iter_cpumask_next(). */
+
+	bpf_iter_cpumask_destroy(&it);
+	bpf_task_release(p);
+	return 0;
+}