
[bpf-next,v2] Add notrace to queued_spin_lock_slowpath

Message ID 20240421234336.542607-1-sidchintamaneni@vt.edu (mailing list archive)
State Changes Requested
Delegated to: BPF

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-11 fail Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-17 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 945 this patch: 945
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers fail 1 blamed authors not CCed: peterz@infradead.org; 16 maintainers not CCed: will@kernel.org eddyz87@gmail.com linux-kselftest@vger.kernel.org boqun.feng@gmail.com haoluo@google.com kpsingh@kernel.org john.fastabend@gmail.com shuah@kernel.org peterz@infradead.org jolsa@kernel.org longman@redhat.com song@kernel.org martin.lau@linux.dev mykolal@fb.com mingo@redhat.com sdf@google.com
netdev/build_clang success Errors and warnings before: 955 this patch: 955
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 956 this patch: 956
netdev/checkpatch warning WARNING: From:/Signed-off-by: email address mismatch: 'From: Siddharth Chintamaneni <sidchintamaneni@gmail.com>' != 'Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@vt.edu>' WARNING: line length of 82 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Siddharth Chintamaneni April 21, 2024, 11:43 p.m. UTC
This patch prevents deadlocks that can occur when multiple BPF
programs are attached to the queued_spin_lock functions. The issue is
similar to the one already discussed [1] for the bpf_spin_lock helpers.

Adding the notrace macro to the queued_spin_lock functions was
discussed [2] when bpf_spin_lock was introduced.

[1] https://lore.kernel.org/bpf/CAE5sdEigPnoGrzN8WU7Tx-h-iFuMZgW06qp0KHWtpvoXxf1OAQ@mail.gmail.com/#r
[2] https://lore.kernel.org/all/20190117011629.efxp7abj4bpf5yco@ast-mbp/t/#maf05c4d71f935f3123013b7ed410e4f50e9da82c

Fixes: d83525ca62cf ("bpf: introduce bpf_spin_lock")
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@vt.edu>
---
 kernel/locking/qspinlock.c                    |  2 +-
 .../bpf/prog_tests/tracing_failure.c          | 24 +++++++++++++++++++
 .../selftests/bpf/progs/tracing_failure.c     |  6 +++++
 3 files changed, 31 insertions(+), 1 deletion(-)
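
For illustration only (not part of the patch; map, program, and hook names are invented), a minimal sketch of the kind of deadlock the commit message refers to: one program pushes to a queue map from an ordinary hook, while a second program attached to queued_spin_lock_slowpath pops from the same map. The push/pop helpers take the map's raw spinlock internally, so a contended push enters the slowpath, the second program fires and queues the same CPU a second time for a lock it is already waiting on, and neither acquisition can complete.

/* Illustrative only: two BPF programs sharing a queue map. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 16);
	__type(value, u64);
} shared_q SEC(".maps");

/* Program A: pushes from an arbitrary hook; a contended push enters
 * the qspinlock slowpath.
 */
SEC("kprobe/do_nanosleep")
int prog_a(struct pt_regs *ctx)
{
	u64 v = 1;

	bpf_map_push_elem(&shared_q, &v, BPF_ANY);
	return 0;
}

/* Program B: fires on the slowpath itself and pops from the same map,
 * queueing this CPU a second time for the lock it is already waiting
 * on.
 */
SEC("fentry/queued_spin_lock_slowpath")
int BPF_PROG(prog_b, struct qspinlock *lock, u32 val)
{
	u64 v;

	bpf_map_pop_elem(&shared_q, &v);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";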

Comments

Jiri Olsa April 22, 2024, 7:47 a.m. UTC | #1
On Sun, Apr 21, 2024 at 07:43:36PM -0400, Siddharth Chintamaneni wrote:
> This patch is to prevent deadlocks when multiple bpf
> programs are attached to queued_spin_locks functions. This issue is similar
> to what is already discussed[1] before with the spin_lock helpers.
> 
> The addition of notrace macro to the queued_spin_locks
> has been discussed[2] when bpf_spin_locks are introduced.
> 
> [1] https://lore.kernel.org/bpf/CAE5sdEigPnoGrzN8WU7Tx-h-iFuMZgW06qp0KHWtpvoXxf1OAQ@mail.gmail.com/#r
> [2] https://lore.kernel.org/all/20190117011629.efxp7abj4bpf5yco@ast-mbp/t/#maf05c4d71f935f3123013b7ed410e4f50e9da82c
> 
> Fixes: d83525ca62cf ("bpf: introduce bpf_spin_lock")
> Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@vt.edu>
> ---
>  kernel/locking/qspinlock.c                    |  2 +-
>  .../bpf/prog_tests/tracing_failure.c          | 24 +++++++++++++++++++
>  .../selftests/bpf/progs/tracing_failure.c     |  6 +++++
>  3 files changed, 31 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index ebe6b8ec7cb3..4d46538d8399 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -313,7 +313,7 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
>   * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
>   *   queue               :         ^--'                             :
>   */
> -void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
> +notrace void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)

we did the same for the bpf spin lock helpers, which is fine, but I wonder
whether removing queued_spin_lock_slowpath from the traceable functions could
break some scripts (even though many probably use the contention tracepoints..)

maybe we could have a list of helpers/kfuncs that can take a spin lock
and deny a bpf program from loading/attaching to queued_spin_lock_slowpath
if it calls anything from that list
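
A very rough sketch of that idea, purely illustrative (the list contents, the hook point, and the helper scan are all assumptions, none of this is existing kernel code):

/* Deny attaching to queued_spin_lock_slowpath when the program calls a
 * helper from a hand-maintained "may take a queued spinlock" list.
 */
static const char * const lock_taking_helpers[] = {
	"bpf_map_push_elem",
	"bpf_map_pop_elem",
	"bpf_map_update_elem",
	/* ... */
};

/* Assumed to scan the program's instructions for a call to @name;
 * left abstract here.
 */
static bool prog_calls_helper(const struct bpf_prog *prog, const char *name);

static int check_attach_to_qspinlock(const struct bpf_prog *prog,
				     const char *attach_fn)
{
	int i;

	if (strcmp(attach_fn, "queued_spin_lock_slowpath"))
		return 0;
	for (i = 0; i < ARRAY_SIZE(lock_taking_helpers); i++)
		if (prog_calls_helper(prog, lock_taking_helpers[i]))
			return -EINVAL;	/* would re-enter the traced lock */
	return 0;
}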

>  {
>  	struct mcs_spinlock *prev, *next, *node;
>  	u32 old, tail;
> diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
> index a222df765bc3..822ee6c559bc 100644
> --- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
> +++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
> @@ -28,10 +28,34 @@ static void test_bpf_spin_lock(bool is_spin_lock)
>  	tracing_failure__destroy(skel);
>  }
>  
> +static void test_queued_spin_lock(void)
> +{
> +	struct tracing_failure *skel;
> +	int err;
> +
> +	skel = tracing_failure__open();
> +	if (!ASSERT_OK_PTR(skel, "tracing_failure__open"))
> +		return;
> +
> +	bpf_program__set_autoload(skel->progs.test_queued_spin_lock, true);
> +
> +	err = tracing_failure__load(skel);
> +	if (!ASSERT_OK(err, "tracing_failure__load"))
> +		goto out;
> +
> +	err = tracing_failure__attach(skel);
> +	ASSERT_ERR(err, "tracing_failure__attach");

the test is broken, the fentry program won't even load on a notrace function

[root@qemu bpf]# ./test_progs -n 391/3
test_queued_spin_lock:PASS:tracing_failure__open 0 nsec
libbpf: prog 'test_queued_spin_lock': failed to find kernel BTF type ID of 'queued_spin_lock_slowpath': -3
libbpf: prog 'test_queued_spin_lock': failed to prepare load attributes: -3
libbpf: prog 'test_queued_spin_lock': failed to load: -3
libbpf: failed to load object 'tracing_failure'
libbpf: failed to load BPF skeleton 'tracing_failure': -3
test_queued_spin_lock:FAIL:tracing_failure__load unexpected error: -3 (errno 3)
#391/3   tracing_failure/queued_spin_lock_slowpath:FAIL
#391     tracing_failure:FAIL
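
If the notrace change stays, the subtest would need to expect the load itself to fail rather than the attach; a minimal sketch of that adjustment, using the same skeleton API as the patch above:

/* Sketch only: with queued_spin_lock_slowpath marked notrace, the
 * fentry program cannot be loaded at all, so assert a load failure
 * instead of an attach failure.
 */
static void test_queued_spin_lock(void)
{
	struct tracing_failure *skel;
	int err;

	skel = tracing_failure__open();
	if (!ASSERT_OK_PTR(skel, "tracing_failure__open"))
		return;

	bpf_program__set_autoload(skel->progs.test_queued_spin_lock, true);

	/* the BTF ID of a notrace function cannot be resolved, so load fails */
	err = tracing_failure__load(skel);
	ASSERT_ERR(err, "tracing_failure__load");

	tracing_failure__destroy(skel);
}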

jirka

> +
> +out:
> +	tracing_failure__destroy(skel);
> +}
> +
>  void test_tracing_failure(void)
>  {
>  	if (test__start_subtest("bpf_spin_lock"))
>  		test_bpf_spin_lock(true);
>  	if (test__start_subtest("bpf_spin_unlock"))
>  		test_bpf_spin_lock(false);
> +	if (test__start_subtest("queued_spin_lock_slowpath"))
> +		test_queued_spin_lock();
>  }
> diff --git a/tools/testing/selftests/bpf/progs/tracing_failure.c b/tools/testing/selftests/bpf/progs/tracing_failure.c
> index d41665d2ec8c..2d2e7fc9d4f0 100644
> --- a/tools/testing/selftests/bpf/progs/tracing_failure.c
> +++ b/tools/testing/selftests/bpf/progs/tracing_failure.c
> @@ -18,3 +18,9 @@ int BPF_PROG(test_spin_unlock, struct bpf_spin_lock *lock)
>  {
>  	return 0;
>  }
> +
> +SEC("?fentry/queued_spin_lock_slowpath")
> +int BPF_PROG(test_queued_spin_lock, struct qspinlock *lock, u32 val)
> +{
> +	return 0;
> +}
> -- 
> 2.43.0
>
Alexei Starovoitov April 22, 2024, 5:13 p.m. UTC | #2
On Mon, Apr 22, 2024 at 12:47 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Sun, Apr 21, 2024 at 07:43:36PM -0400, Siddharth Chintamaneni wrote:
> > This patch is to prevent deadlocks when multiple bpf
> > programs are attached to queued_spin_locks functions. This issue is similar
> > to what is already discussed[1] before with the spin_lock helpers.
> >
> > The addition of notrace macro to the queued_spin_locks
> > has been discussed[2] when bpf_spin_locks are introduced.
> >
> > [1] https://lore.kernel.org/bpf/CAE5sdEigPnoGrzN8WU7Tx-h-iFuMZgW06qp0KHWtpvoXxf1OAQ@mail.gmail.com/#r
> > [2] https://lore.kernel.org/all/20190117011629.efxp7abj4bpf5yco@ast-mbp/t/#maf05c4d71f935f3123013b7ed410e4f50e9da82c
> >
> > Fixes: d83525ca62cf ("bpf: introduce bpf_spin_lock")
> > Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@vt.edu>
> > ---
> >  kernel/locking/qspinlock.c                    |  2 +-
> >  .../bpf/prog_tests/tracing_failure.c          | 24 +++++++++++++++++++
> >  .../selftests/bpf/progs/tracing_failure.c     |  6 +++++
> >  3 files changed, 31 insertions(+), 1 deletion(-)
> >
> > diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> > index ebe6b8ec7cb3..4d46538d8399 100644
> > --- a/kernel/locking/qspinlock.c
> > +++ b/kernel/locking/qspinlock.c
> > @@ -313,7 +313,7 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
> >   * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
> >   *   queue               :         ^--'                             :
> >   */
> > -void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
> > +notrace void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>
> we did the same for bpf spin lock helpers, which is fine, but I wonder
> removing queued_spin_lock_slowpath from traceable functions could break
> some scripts (even though many probably use contention tracepoints..)
>
> maybe we could have a list of helpers/kfuncs that could call spin lock
> and deny bpf program to load/attach to queued_spin_lock_slowpath
> if it calls anything from that list

We can filter out many such functions, but the possibility of deadlock
will still exist.
Adding notrace here won't help much, since there are tracepoints in
there, trace_contention_begin/end, which are quite useful and we should
still allow bpf to use them.
I think the only bulletproof way is to detect deadlocks at runtime.
I'm working on such a "try hard to spin_lock but abort if it deadlocks"
approach.
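
For context, the rough shape such a run-time check could take (all names here are invented, and real patches would also have to handle ABBA ordering, e.g. via timeouts): a per-CPU record of locks currently being acquired, aborting instead of spinning when the same lock shows up twice on one CPU.

/* Illustrative sketch only.  Assumes preemption is already disabled in
 * the contexts where this would run.
 */
#define MAX_HELD 4

struct held_locks {
	void *addrs[MAX_HELD];
	int cnt;
};
static DEFINE_PER_CPU(struct held_locks, held_locks);

static int try_hard_spin_lock(raw_spinlock_t *lock)
{
	struct held_locks *h = this_cpu_ptr(&held_locks);
	int i;

	for (i = 0; i < h->cnt; i++)
		if (h->addrs[i] == lock)
			return -EDEADLK;	/* AA deadlock detected */
	if (h->cnt == MAX_HELD)
		return -EBUSY;			/* nested too deep, give up */
	h->addrs[h->cnt++] = lock;
	raw_spin_lock(lock);
	return 0;
}

static void try_hard_spin_unlock(raw_spinlock_t *lock)
{
	struct held_locks *h = this_cpu_ptr(&held_locks);

	raw_spin_unlock(lock);
	h->cnt--;	/* assumes LIFO lock/unlock order for brevity */
}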
Siddharth Chintamaneni April 22, 2024, 11:20 p.m. UTC | #3
On Mon, 22 Apr 2024 at 13:13, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Apr 22, 2024 at 12:47 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Sun, Apr 21, 2024 at 07:43:36PM -0400, Siddharth Chintamaneni wrote:
> > > This patch is to prevent deadlocks when multiple bpf
> > > programs are attached to queued_spin_locks functions. This issue is similar
> > > to what is already discussed[1] before with the spin_lock helpers.
> > >
> > > The addition of notrace macro to the queued_spin_locks
> > > has been discussed[2] when bpf_spin_locks are introduced.
> > >
> > > [1] https://lore.kernel.org/bpf/CAE5sdEigPnoGrzN8WU7Tx-h-iFuMZgW06qp0KHWtpvoXxf1OAQ@mail.gmail.com/#r
> > > [2] https://lore.kernel.org/all/20190117011629.efxp7abj4bpf5yco@ast-mbp/t/#maf05c4d71f935f3123013b7ed410e4f50e9da82c
> > >
> > > Fixes: d83525ca62cf ("bpf: introduce bpf_spin_lock")
> > > Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@vt.edu>
> > > ---
> > >  kernel/locking/qspinlock.c                    |  2 +-
> > >  .../bpf/prog_tests/tracing_failure.c          | 24 +++++++++++++++++++
> > >  .../selftests/bpf/progs/tracing_failure.c     |  6 +++++
> > >  3 files changed, 31 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> > > index ebe6b8ec7cb3..4d46538d8399 100644
> > > --- a/kernel/locking/qspinlock.c
> > > +++ b/kernel/locking/qspinlock.c
> > > @@ -313,7 +313,7 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
> > >   * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
> > >   *   queue               :         ^--'                             :
> > >   */
> > > -void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
> > > +notrace void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
> >
> > we did the same for bpf spin lock helpers, which is fine, but I wonder
> > removing queued_spin_lock_slowpath from traceable functions could break
> > some scripts (even though many probably use contention tracepoints..)
> >
> > maybe we could have a list of helpers/kfuncs that could call spin lock
> > and deny bpf program to load/attach to queued_spin_lock_slowpath
> > if it calls anything from that list
>
> We can filter out many such functions, but the possibility of deadlock
> will still exist.
> Adding notrace here won't help much,
> since there are tracepoints in there: trace_contention_begin/end
> which are quite useful and we should still allow bpf to use them.
> I think the only bullet proof way is to detect deadlocks at runtime.
> I'm working on such "try hard to spin_lock but abort if it deadlocks."

I agree with the point that marking all these functions notrace will not
resolve the issue. I could also construct a scenario where BPF programs
easily end up in a deadlock by using the bpf_map_pop_elem and
bpf_map_push_elem helpers from two different BPF programs accessing the
same map. Here are some issues raised by syzbot [2, 3].

I also believe that a BPF program can end up in a deadlock scenario
without any assistance from a second BPF program, as described above.
The runtime solution sounds like a better fit for this problem, unless
there is a BPF program that must definitely run for the performance or
security of the system (like an LSM hook or a nested scheduling type
program, as mentioned here [1]).

In those cases, the user assumes that these BPF programs will always
trigger. So, to address these types of issues, we are currently working
on a call-graph-based analysis of the helper functions so that the
verifier can decide at load time whether to load the program or not,
ensuring that if a BPF program is attached, it will be triggered.

[1] https://lore.kernel.org/all/a15f6a20-c902-4057-a1a9-8259a225bb8b@linux.dev/
[2] https://lore.kernel.org/lkml/0000000000004aa700061379547e@google.com/
[3] https://lore.kernel.org/lkml/0000000000004c3fc90615f37756@google.com/

PS. We are following a similar approach to solve the stack overflow
problem with nesting.

Thanks,
Siddharth
Alexei Starovoitov April 22, 2024, 11:47 p.m. UTC | #4
On Mon, Apr 22, 2024 at 4:20 PM Siddharth Chintamaneni
<sidchintamaneni@gmail.com> wrote:
>
> On Mon, 22 Apr 2024 at 13:13, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Apr 22, 2024 at 12:47 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> > >
> > > On Sun, Apr 21, 2024 at 07:43:36PM -0400, Siddharth Chintamaneni wrote:
> > > > This patch is to prevent deadlocks when multiple bpf
> > > > programs are attached to queued_spin_locks functions. This issue is similar
> > > > to what is already discussed[1] before with the spin_lock helpers.
> > > >
> > > > The addition of notrace macro to the queued_spin_locks
> > > > has been discussed[2] when bpf_spin_locks are introduced.
> > > >
> > > > [1] https://lore.kernel.org/bpf/CAE5sdEigPnoGrzN8WU7Tx-h-iFuMZgW06qp0KHWtpvoXxf1OAQ@mail.gmail.com/#r
> > > > [2] https://lore.kernel.org/all/20190117011629.efxp7abj4bpf5yco@ast-mbp/t/#maf05c4d71f935f3123013b7ed410e4f50e9da82c
> > > >
> > > > Fixes: d83525ca62cf ("bpf: introduce bpf_spin_lock")
> > > > Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@vt.edu>
> > > > ---
> > > >  kernel/locking/qspinlock.c                    |  2 +-
> > > >  .../bpf/prog_tests/tracing_failure.c          | 24 +++++++++++++++++++
> > > >  .../selftests/bpf/progs/tracing_failure.c     |  6 +++++
> > > >  3 files changed, 31 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> > > > index ebe6b8ec7cb3..4d46538d8399 100644
> > > > --- a/kernel/locking/qspinlock.c
> > > > +++ b/kernel/locking/qspinlock.c
> > > > @@ -313,7 +313,7 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
> > > >   * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
> > > >   *   queue               :         ^--'                             :
> > > >   */
> > > > -void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
> > > > +notrace void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
> > >
> > > we did the same for bpf spin lock helpers, which is fine, but I wonder
> > > removing queued_spin_lock_slowpath from traceable functions could break
> > > some scripts (even though many probably use contention tracepoints..)
> > >
> > > maybe we could have a list of helpers/kfuncs that could call spin lock
> > > and deny bpf program to load/attach to queued_spin_lock_slowpath
> > > if it calls anything from that list
> >
> > We can filter out many such functions, but the possibility of deadlock
> > will still exist.
> > Adding notrace here won't help much,
> > since there are tracepoints in there: trace_contention_begin/end
> > which are quite useful and we should still allow bpf to use them.
> > I think the only bullet proof way is to detect deadlocks at runtime.
> > I'm working on such "try hard to spin_lock but abort if it deadlocks."
>
> I agree with the point that notracing all the functions will not
> resolve the issue. I could also find a scenario where BPF programs
> will end up in a deadlock easily by using bpf_map_pop_elem and
> bpf_map_push_elem helper functions called from two different BPF
> programs accessing the same map. Here are some issues raised by syzbot
> [2, 3].

The ringbuf and queue_stack maps should probably be fixed now,
similar to the hashmap's __this_cpu_inc_return(*(htab->map_locked...)
approach.
Both ringbuf and queue_stack can handle a failure to take the lock.
That will address the issue spotted by these 2 syzbot reports.
Could you work on such patches?
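
For reference, a rough sketch of what that guard could look like for the queue/stack map; the map_locked field and the function names are invented here, only the pattern mirrors the existing hashtab code:

static inline int queue_stack_map_lock(struct bpf_queue_stack *qs,
				       unsigned long *pflags)
{
	unsigned long flags;

	preempt_disable();
	/* map_locked: hypothetical per-cpu counter, as in hashtab */
	if (unlikely(__this_cpu_inc_return(*qs->map_locked) != 1)) {
		__this_cpu_dec(*qs->map_locked);
		preempt_enable();
		return -EBUSY;	/* re-entered on this CPU, caller bails out */
	}
	raw_spin_lock_irqsave(&qs->lock, flags);
	*pflags = flags;
	return 0;
}

static inline void queue_stack_map_unlock(struct bpf_queue_stack *qs,
					  unsigned long flags)
{
	raw_spin_unlock_irqrestore(&qs->lock, flags);
	__this_cpu_dec(*qs->map_locked);
	preempt_enable();
}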

The full run-time solution will take time to land and
may be too big to be backportable.
I'll certainly cc you on the patches when I send them.

>  I also believe that a BPF program can end up in a deadlock scenario
> without any assistance from the second BPF program, like described
> above. The runtime solution sounds like a better fit to address this
> problem, unless there is a BPF program that should definitely run for
> the performance or security of the system (like an LSM hook or a
> nested scheduling type program as mentioned here [1]).

Right. Certain bpf progs like tcp-bpf don't even have a recursion
run-time counter, because they have to nest and it's safe to nest.

> In those cases, the user assumes that these BPF programs will always
> trigger. So, to address these types of issues, we are currently
> working on a helper's function callgraph based approach so that the
> verifier gets the ability to make a decision during load time on
> whether to load it or not, ensuring that if a BPF program is attached,
> it will be triggered.

callgraph approach? Could you share more details?

>
> [1] https://lore.kernel.org/all/a15f6a20-c902-4057-a1a9-8259a225bb8b@linux.dev/
> [2] https://lore.kernel.org/lkml/0000000000004aa700061379547e@google.com/
> [3] https://lore.kernel.org/lkml/0000000000004c3fc90615f37756@google.com/
>
> PS. We are following a similar approach to solve the stackoverflow
> problem with nesting.

Not following. The stack overflow issue is being fixed by not using
the kernel stack. So each bpf prog will consume a tiny bit of stack
to save frame pointer, return address, and callee regs.
Siddharth Chintamaneni April 23, 2024, 11:58 p.m. UTC | #5
> > I agree with the point that notracing all the functions will not
> > resolve the issue. I could also find a scenario where BPF programs
> > will end up in a deadlock easily by using bpf_map_pop_elem and
> > bpf_map_push_elem helper functions called from two different BPF
> > programs accessing the same map. Here are some issues raised by syzbot
> > [2, 3].
>
> ringbuf and stackqueue maps should probably be fixed now
> similar to hashmap's __this_cpu_inc_return(*(htab->map_locked...)
> approach.
> Both ringbug and queue_stack can handle failure to lock.
> That will address the issue spotted by these 2 syzbot reports.
> Could you work on such patches?

Just seen your latest patches related to this. Yes, I will work on the
fixes and send the patches.

> > In those cases, the user assumes that these BPF programs will always
> > trigger. So, to address these types of issues, we are currently
> > working on a helper's function callgraph based approach so that the
> > verifier gets the ability to make a decision during load time on
> > whether to load it or not, ensuring that if a BPF program is attached,
> > it will be triggered.
>
> callgraph approach? Could you share more details?

We are generating a call graph for all the helper functions (including
the indirect call targets) and filtering out the functions, and their
callees, that take locks. So if a BPF program tries to attach to one of
these lock-taking functions and the helper it calls also contains
lock-taking functions, we want to reject it at load time. This type of
approach may lead to many false positives, but it adheres to the
principle that "if a BPF program is attached, then it should get
triggered as expected without affecting the kernel." We are planning to
work towards this solution and would love your feedback on it; a rough
sketch of the check is below.
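
A minimal sketch of that load-time check, assuming the call graph and the lock annotations come from offline analysis of vmlinux (nothing here is existing verifier code); allow_load() would be evaluated for the attach target against every helper the program calls.

#include <stdbool.h>
#include <stddef.h>

struct kfunc {
	const char *name;
	bool takes_lock;		/* e.g. calls queued_spin_lock_slowpath */
	struct kfunc **callees;		/* NULL-terminated; assumed acyclic */
};

/* Does any function reachable from f take a lock? */
static bool reaches_lock(const struct kfunc *f)
{
	struct kfunc **c;

	if (f->takes_lock)
		return true;
	for (c = f->callees; c && *c; c++)
		if (reaches_lock(*c))
			return true;
	return false;
}

/* Reject at load time when the attach target takes a lock and a called
 * helper can end up taking a lock as well.
 */
static bool allow_load(const struct kfunc *attach_point,
		       const struct kfunc *called_helper)
{
	return !(attach_point->takes_lock && reaches_lock(called_helper));
}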

> Not following. The stack overflow issue is being fixed by not using
> the kernel stack. So each bpf prog will consume a tiny bit of stack
> to save frame pointer, return address, and callee regs.

(IIRC) the email chain mentions that BPF programs are going to use a
new stack allocated from the heap. With a deeper call chain of fentry
BPF programs, isn't it still possible to overflow the stack? Also,
through our call graph analysis, we found that some helpers have really
deep call chains. If a BPF program is attached at the deepest point in
a helper's call chain, isn't there still a possibility of overflowing
it? In LPC '23 [1], we presented a similar idea of stack switching to
prevent the overflow when nesting, and later realized that it may give
you some extra space and the ability to nest more, but it does not
entirely resolve the issue (tail calls will make it even worse).


[1] https://lpc.events/event/17/contributions/1595/attachments/1230/2506/LPC2023_slides.pdf
Alexei Starovoitov April 24, 2024, 6:45 p.m. UTC | #6
On Tue, Apr 23, 2024 at 4:58 PM Siddharth Chintamaneni
<sidchintamaneni@gmail.com> wrote:
>
> > > I agree with the point that notracing all the functions will not
> > > resolve the issue. I could also find a scenario where BPF programs
> > > will end up in a deadlock easily by using bpf_map_pop_elem and
> > > bpf_map_push_elem helper functions called from two different BPF
> > > programs accessing the same map. Here are some issues raised by syzbot
> > > [2, 3].
> >
> > ringbuf and stackqueue maps should probably be fixed now
> > similar to hashmap's __this_cpu_inc_return(*(htab->map_locked...)
> > approach.
> > Both ringbug and queue_stack can handle failure to lock.
> > That will address the issue spotted by these 2 syzbot reports.
> > Could you work on such patches?
>
> Just seen your latest patches related to this. Yes, I will work on the
> fixes and send the patches.

Great.

> > > In those cases, the user assumes that these BPF programs will always
> > > trigger. So, to address these types of issues, we are currently
> > > working on a helper's function callgraph based approach so that the
> > > verifier gets the ability to make a decision during load time on
> > > whether to load it or not, ensuring that if a BPF program is attached,
> > > it will be triggered.
> >
> > callgraph approach? Could you share more details?
>
> We are generating a call graph for all the helper functions (including
> the indirect call targets) and trying to filter out the functions and
> their callee's which take locks. So if any BPF program tries to attach
> to these lock-taking functions and contains these lock-taking
> functions inside the helper it is calling, we want to reject it at
> load time. This type of approach may lead to many false positives, but
> it will adhere to the principle that "If a BPF program is attached,
> then it should get triggered as expected without affecting the
> kernel." We are planning to work towards this solution and would love
> your feedback on it.

I can see how you can build a call graph across bpf progs and the
helpers/kfuncs that they call, but not across arbitrary attach points
in the kernel. So attaching to queued_spin_lock_slowpath or some
tracepoint won't be recognized in such a call graph. Or am I missing
something?

In the past we floated an idea to dual-compile the kernel: once for
native and once for the BPF ISA, so that all of the kernel code is
analyzable. But there was no strong enough use case to do it.

> > Not following. The stack overflow issue is being fixed by not using
> > the kernel stack. So each bpf prog will consume a tiny bit of stack
> > to save frame pointer, return address, and callee regs.
>
> (IIRC), in the email chain, it is mentioned that BPF programs are
> going to use a new stack allocated from the heap. I think with a
> deeper call chain of fentry BPF programs, isn't it still a possibility
> to overflow the stack?

stack overflow is a possibility even without bpf. That's why the stack
is now virtually mapped with guard pages.

> Also, through our call graph analysis, we found
> that some helpers have really deeper call depths. If a BPF program is
> attached at the deepest point in the helper's call chain, isn't there
> still a possibility to overflow it? In LPC '23 [1], we presented a
> similar idea of stack switching to prevent the overflow in nesting and
> later realized that it may give you some extra space and the ability
> to nest more, but it is not entirely resolving the issue (tail calls
> will even worsen this issue).

The problem with any kind of stack switching is the reliability of
stack traces. Kernel panics must be able to walk the stack, even when
there are bugs, and a full stack switch makes that risky.
Livepatching also depends on reliable stack walks.
That's another reason to avoid a stack switch.

>
> [1] https://lpc.events/event/17/contributions/1595/attachments/1230/2506/LPC2023_slides.pdf

Patch

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ebe6b8ec7cb3..4d46538d8399 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -313,7 +313,7 @@  static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
  * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
  *   queue               :         ^--'                             :
  */
-void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+notrace void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 {
 	struct mcs_spinlock *prev, *next, *node;
 	u32 old, tail;
diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
index a222df765bc3..822ee6c559bc 100644
--- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
@@ -28,10 +28,34 @@  static void test_bpf_spin_lock(bool is_spin_lock)
 	tracing_failure__destroy(skel);
 }
 
+static void test_queued_spin_lock(void)
+{
+	struct tracing_failure *skel;
+	int err;
+
+	skel = tracing_failure__open();
+	if (!ASSERT_OK_PTR(skel, "tracing_failure__open"))
+		return;
+
+	bpf_program__set_autoload(skel->progs.test_queued_spin_lock, true);
+
+	err = tracing_failure__load(skel);
+	if (!ASSERT_OK(err, "tracing_failure__load"))
+		goto out;
+
+	err = tracing_failure__attach(skel);
+	ASSERT_ERR(err, "tracing_failure__attach");
+
+out:
+	tracing_failure__destroy(skel);
+}
+
 void test_tracing_failure(void)
 {
 	if (test__start_subtest("bpf_spin_lock"))
 		test_bpf_spin_lock(true);
 	if (test__start_subtest("bpf_spin_unlock"))
 		test_bpf_spin_lock(false);
+	if (test__start_subtest("queued_spin_lock_slowpath"))
+		test_queued_spin_lock();
 }
diff --git a/tools/testing/selftests/bpf/progs/tracing_failure.c b/tools/testing/selftests/bpf/progs/tracing_failure.c
index d41665d2ec8c..2d2e7fc9d4f0 100644
--- a/tools/testing/selftests/bpf/progs/tracing_failure.c
+++ b/tools/testing/selftests/bpf/progs/tracing_failure.c
@@ -18,3 +18,9 @@  int BPF_PROG(test_spin_unlock, struct bpf_spin_lock *lock)
 {
 	return 0;
 }
+
+SEC("?fentry/queued_spin_lock_slowpath")
+int BPF_PROG(test_queued_spin_lock, struct qspinlock *lock, u32 val)
+{
+	return 0;
+}