diff mbox series

[bpf-next] bpf: support deferring bpf_link dealloc to after RCU grace period

Message ID 20240326211427.1156080-1-andrii@kernel.org (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series [bpf-next] bpf: support deferring bpf_link dealloc to after RCU grace period | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 7482 this patch: 7482
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers fail 5 blamed authors not CCed: jolsa@kernel.org mhiramat@kernel.org laoar.shao@gmail.com sdf@google.com yonghong.song@linux.dev; 14 maintainers not CCed: song@kernel.org rostedt@goodmis.org mathieu.desnoyers@efficios.com kpsingh@kernel.org laoar.shao@gmail.com linux-trace-kernel@vger.kernel.org yonghong.song@linux.dev eddyz87@gmail.com jolsa@kernel.org mhiramat@kernel.org martin.lau@linux.dev sdf@google.com john.fastabend@gmail.com haoluo@google.com
netdev/build_clang success Errors and warnings before: 2244 this patch: 2244
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 7870 this patch: 7870
netdev/checkpatch warning WARNING: line length of 93 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc

Commit Message

Andrii Nakryiko March 26, 2024, 9:14 p.m. UTC
BPF link for some program types is passed as a "context" which can be
used by those BPF programs to look up additional information. E.g., for
BPF raw tracepoints, link is used to fetch BPF cookie value, similarly
for BPF multi-kprobes and multi-uprobes.

Because of this runtime dependency, when bpf_link refcnt drops to zero
there could still be active BPF programs running accessing link data
(cookie, program pointer, etc).

This patch adds generic support to defer bpf_link dealloc callback to
after RCU GP, if requested. This is done by exposing two different
deallocation callbacks, one synchronous and one deferred. If deferred
one is provided, bpf_link_free() will schedule dealloc_deferred()
callback to happen after RCU GP.

BPF is using two flavors of RCU: "classic" non-sleepable one and RCU
tasks trace one. The latter is used when sleepable BPF programs are
used. bpf_link_free() accommodates that by checking underlying BPF
program's sleepable flag, and goes either through normal RCU GP only for
non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP
(taking into account rcu_trace_implies_rcu_gp() optimization), if BPF
program is sleepable.

We use this for raw tracepoint, multi-kprobe, and multi-uprobe links,
all of which dereference link during program run.

Fixes: d4dfc5700e86 ("bpf: pass whole link instead of prog when triggering raw tracepoint")
Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link")
Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com
Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com
Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf.h      | 16 +++++++++++++++-
 kernel/bpf/syscall.c     | 35 ++++++++++++++++++++++++++++++++---
 kernel/trace/bpf_trace.c |  4 ++--
 3 files changed, 49 insertions(+), 6 deletions(-)

Comments

Jiri Olsa March 28, 2024, 10:10 a.m. UTC | #1
On Tue, Mar 26, 2024 at 02:14:27PM -0700, Andrii Nakryiko wrote:
> BPF link for some program types is passed as a "context" which can be
> used by those BPF programs to look up additional information. E.g., for
> BPF raw tracepoints, link is used to fetch BPF cookie value, similarly
> for BPF multi-kprobes and multi-uprobes.
> 
> Because of this runtime dependency, when bpf_link refcnt drops to zero
> there could still be active BPF programs running accessing link data
> (cookie, program pointer, etc).
> 
> This patch adds generic support to defer bpf_link dealloc callback to
> after RCU GP, if requested. This is done by exposing two different
> deallocation callbacks, one synchronous and one deferred. If deferred
> one is provided, bpf_link_free() will schedule dealloc_deferred()
> callback to happen after RCU GP.
> 
> BPF is using two flavors of RCU: "classic" non-sleepable one and RCU
> tasks trace one. The latter is used when sleepable BPF programs are
> used. bpf_link_free() accommodates that by checking underlying BPF
> program's sleepable flag, and goes either through normal RCU GP only for
> non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP
> (taking into account rcu_trace_implies_rcu_gp() optimization), if BPF
> program is sleepable.
> 
> We use this for raw tracepoint, multi-kprobe, and multi-uprobe links,
> all of which dereference link during program run.
> 
> Fixes: d4dfc5700e86 ("bpf: pass whole link instead of prog when triggering raw tracepoint")
> Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
> Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link")
> Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com
> Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com
> Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>

Acked-by: Jiri Olsa <jolsa@kernel.org>

jirka

> ---
>  include/linux/bpf.h      | 16 +++++++++++++++-
>  kernel/bpf/syscall.c     | 35 ++++++++++++++++++++++++++++++++---
>  kernel/trace/bpf_trace.c |  4 ++--
>  3 files changed, 49 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 62762390c93d..e52d5b3ee45e 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1573,12 +1573,26 @@ struct bpf_link {
>  	enum bpf_link_type type;
>  	const struct bpf_link_ops *ops;
>  	struct bpf_prog *prog;
> -	struct work_struct work;
> +	/* rcu is used before freeing, work can be used to schedule that
> +	 * RCU-based freeing before that, so they never overlap
> +	 */
> +	union {
> +		struct rcu_head rcu;
> +		struct work_struct work;
> +	};
>  };
>  
>  struct bpf_link_ops {
>  	void (*release)(struct bpf_link *link);
> +	/* deallocate link resources callback, called without RCU grace period
> +	 * waiting
> +	 */
>  	void (*dealloc)(struct bpf_link *link);
> +	/* deallocate link resources callback, called after RCU grace period;
> +	 * if underlying BPF program is sleepable we go through tasks trace
> +	 * RCU GP and then "classic" RCU GP
> +	 */
> +	void (*dealloc_deferred)(struct bpf_link *link);
>  	int (*detach)(struct bpf_link *link);
>  	int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog,
>  			   struct bpf_prog *old_prog);
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index e44c276e8617..c0f2f052a02c 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -3024,17 +3024,46 @@ void bpf_link_inc(struct bpf_link *link)
>  	atomic64_inc(&link->refcnt);
>  }
>  
> +static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
> +{
> +	struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
> +
> +	/* free bpf_link and its containing memory */
> +	link->ops->dealloc_deferred(link);
> +}
> +
> +static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
> +{
> +	if (rcu_trace_implies_rcu_gp())
> +		bpf_link_defer_dealloc_rcu_gp(rcu);
> +	else
> +		call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
> +}
> +
>  /* bpf_link_free is guaranteed to be called from process context */
>  static void bpf_link_free(struct bpf_link *link)
>  {
> +	bool sleepable = false;
> +
>  	bpf_link_free_id(link->id);
>  	if (link->prog) {
> +		sleepable = link->prog->sleepable;
>  		/* detach BPF program, clean up used resources */
>  		link->ops->release(link);
>  		bpf_prog_put(link->prog);
>  	}
> -	/* free bpf_link and its containing memory */
> -	link->ops->dealloc(link);
> +	if (link->ops->dealloc_deferred) {
> +		/* schedule BPF link deallocation; if underlying BPF program
> +		 * is sleepable, we need to first wait for RCU tasks trace
> +		 * sync, then go through "classic" RCU grace period
> +		 */
> +		if (sleepable)
> +			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
> +		else
> +			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
> +	}
> +	if (link->ops->dealloc)
> +		link->ops->dealloc(link);
>  }
>  
>  static void bpf_link_put_deferred(struct work_struct *work)
> @@ -3539,7 +3568,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
>  
>  static const struct bpf_link_ops bpf_raw_tp_link_lops = {
>  	.release = bpf_raw_tp_link_release,
> -	.dealloc = bpf_raw_tp_link_dealloc,
> +	.dealloc_deferred = bpf_raw_tp_link_dealloc,
>  	.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
>  	.fill_link_info = bpf_raw_tp_link_fill_link_info,
>  };
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 6d0c95638e1b..98eacfacb73a 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -2740,7 +2740,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
>  
>  static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
>  	.release = bpf_kprobe_multi_link_release,
> -	.dealloc = bpf_kprobe_multi_link_dealloc,
> +	.dealloc_deferred = bpf_kprobe_multi_link_dealloc,
>  	.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
>  };
>  
> @@ -3254,7 +3254,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
>  
>  static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
>  	.release = bpf_uprobe_multi_link_release,
> -	.dealloc = bpf_uprobe_multi_link_dealloc,
> +	.dealloc_deferred = bpf_uprobe_multi_link_dealloc,
>  	.fill_link_info = bpf_uprobe_multi_link_fill_link_info,
>  };
>  
> -- 
> 2.43.0
> 
>
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 62762390c93d..e52d5b3ee45e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1573,12 +1573,26 @@  struct bpf_link {
 	enum bpf_link_type type;
 	const struct bpf_link_ops *ops;
 	struct bpf_prog *prog;
-	struct work_struct work;
+	/* rcu is used before freeing, work can be used to schedule that
+	 * RCU-based freeing before that, so they never overlap
+	 */
+	union {
+		struct rcu_head rcu;
+		struct work_struct work;
+	};
 };
 
 struct bpf_link_ops {
 	void (*release)(struct bpf_link *link);
+	/* deallocate link resources callback, called without RCU grace period
+	 * waiting
+	 */
 	void (*dealloc)(struct bpf_link *link);
+	/* deallocate link resources callback, called after RCU grace period;
+	 * if underlying BPF program is sleepable we go through tasks trace
+	 * RCU GP and then "classic" RCU GP
+	 */
+	void (*dealloc_deferred)(struct bpf_link *link);
 	int (*detach)(struct bpf_link *link);
 	int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog,
 			   struct bpf_prog *old_prog);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e44c276e8617..c0f2f052a02c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3024,17 +3024,46 @@  void bpf_link_inc(struct bpf_link *link)
 	atomic64_inc(&link->refcnt);
 }
 
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
+{
+	struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
+
+	/* free bpf_link and its containing memory */
+	link->ops->dealloc_deferred(link);
+}
+
+static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
+{
+	if (rcu_trace_implies_rcu_gp())
+		bpf_link_defer_dealloc_rcu_gp(rcu);
+	else
+		call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+}
+
 /* bpf_link_free is guaranteed to be called from process context */
 static void bpf_link_free(struct bpf_link *link)
 {
+	bool sleepable = false;
+
 	bpf_link_free_id(link->id);
 	if (link->prog) {
+		sleepable = link->prog->sleepable;
 		/* detach BPF program, clean up used resources */
 		link->ops->release(link);
 		bpf_prog_put(link->prog);
 	}
-	/* free bpf_link and its containing memory */
-	link->ops->dealloc(link);
+	if (link->ops->dealloc_deferred) {
+		/* schedule BPF link deallocation; if underlying BPF program
+		 * is sleepable, we need to first wait for RCU tasks trace
+		 * sync, then go through "classic" RCU grace period
+		 */
+		if (sleepable)
+			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+		else
+			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+	}
+	if (link->ops->dealloc)
+		link->ops->dealloc(link);
 }
 
 static void bpf_link_put_deferred(struct work_struct *work)
@@ -3539,7 +3568,7 @@  static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
 
 static const struct bpf_link_ops bpf_raw_tp_link_lops = {
 	.release = bpf_raw_tp_link_release,
-	.dealloc = bpf_raw_tp_link_dealloc,
+	.dealloc_deferred = bpf_raw_tp_link_dealloc,
 	.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
 	.fill_link_info = bpf_raw_tp_link_fill_link_info,
 };
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6d0c95638e1b..98eacfacb73a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2740,7 +2740,7 @@  static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
 
 static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
 	.release = bpf_kprobe_multi_link_release,
-	.dealloc = bpf_kprobe_multi_link_dealloc,
+	.dealloc_deferred = bpf_kprobe_multi_link_dealloc,
 	.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
 };
 
@@ -3254,7 +3254,7 @@  static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
 
 static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
 	.release = bpf_uprobe_multi_link_release,
-	.dealloc = bpf_uprobe_multi_link_dealloc,
+	.dealloc_deferred = bpf_uprobe_multi_link_dealloc,
 	.fill_link_info = bpf_uprobe_multi_link_fill_link_info,
 };