
[bpf-next,11/18] bpf: wq: add bpf_wq_init

Message ID 20240416-bpf_wq-v1-11-c9e66092f842@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Series: Introduce bpf_wq

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
netdev/series_format fail Series longer than 15 patches (and no cover letter)
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 7516 this patch: 7518
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 13 of 13 maintainers
netdev/build_clang success Errors and warnings before: 1228 this patch: 1228
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 7898 this patch: 7900
netdev/checkpatch warning WARNING: line length of 83 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 7 this patch: 7
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 fail Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 fail Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 fail Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 fail Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18

Commit Message

Benjamin Tissoires April 16, 2024, 2:08 p.m. UTC
We need to teach the verifier about the second argument which is declared
as void * but which is of type KF_ARG_PTR_TO_MAP. We could have dropped
this extra case if we declared the second argument as struct bpf_map *,
but that would mean users have to add an extra cast to get their program to
compile.

We also need to duplicate the timer code that checks whether the map
argument matches the provided workqueue.

Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>

---

Note that the timer code, when matching the map, checks for constant map
pointers. I wonder if this needs to be enforced here too (i.e. requiring
the map pointer to be constant).
---
 include/uapi/linux/bpf.h |   9 ++++
 kernel/bpf/helpers.c     | 114 ++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c    |   6 +++
 3 files changed, 127 insertions(+), 2 deletions(-)
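
For illustration, a minimal BPF-program-side sketch of how this kfunc could be
called with the void * second argument, so that &hmap needs no cast. The map
layout, section names and the extern kfunc declaration below are assumptions
made for the sketch, not part of this patch:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct elem {
	struct bpf_wq wq;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 8);
	__type(key, int);
	__type(value, struct elem);
} hmap SEC(".maps");

/* kfunc prototype as declared by this patch */
extern int bpf_wq_init(struct bpf_wq *wq, void *map, unsigned int flags) __weak __ksym;

SEC("tc")
int init_wq(void *ctx)
{
	struct elem *val;
	int key = 0;

	val = bpf_map_lookup_elem(&hmap, &key);
	if (!val)
		return 0;

	/* second argument is void *, so &hmap can be passed without a cast */
	bpf_wq_init(&val->wq, &hmap, 0);
	return 0;
}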

Comments

Alexei Starovoitov April 19, 2024, 5:25 a.m. UTC | #1
On Tue, Apr 16, 2024 at 04:08:24PM +0200, Benjamin Tissoires wrote:
> We need to teach the verifier about the second argument which is declared
> as void * but which is of type KF_ARG_PTR_TO_MAP. We could have dropped
> this extra case if we declared the second argument as struct bpf_map *,
> but that means users will have to do extra casting to have their program
> compile.
> 
> We also need to duplicate the timer code for the checking if the map
> argument is matching the provided workqueue.
> 
> Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
> 
> ---
> 
> Note that the timer code when matching for the map is checking for
> constant map pointers. I wonder if this needs to be enforced too
> (being constant?)
> ---
>  include/uapi/linux/bpf.h |   9 ++++
>  kernel/bpf/helpers.c     | 114 ++++++++++++++++++++++++++++++++++++++++++++++-
>  kernel/bpf/verifier.c    |   6 +++
>  3 files changed, 127 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e4ae83550fb3..519f6019d158 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7502,4 +7502,13 @@ struct bpf_iter_num {
>  	__u64 __opaque[1];
>  } __attribute__((aligned(8)));
>  
> +/*
> + * Flags to control bpf_wq_init() and bpf_wq_set_callback() behaviour.
> + *     - BPF_F_WQ_SLEEPABLE: the callback needs to run in
> + *       a sleepable context
> + */
> +enum {
> +	BPF_F_WQ_SLEEPABLE = (1ULL << 0),
> +};

Just started looking at the patch set. The first reaction is that
this flag is odd. Why add it? wq provides a sleepable ctx.
Why would the program ask to be non-sleepable in wq?
If it needs the callback to run in an rcu critical section, it can use the
bpf_rcu_read_lock() kfunc as the first call in such a callback, and that will
be equivalent to not passing this BPF_F_WQ_SLEEPABLE flag.
It seems the flag can be dropped and the complexity reduced.
The verifier complications in later patches are due to this flag too...
I just don't see the point.

> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 9fd12d480b8b..9ac1b8bb3a01 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1109,11 +1109,18 @@ struct bpf_hrtimer {
>  	struct hrtimer timer;
>  };
>  
> -/* the actual struct hidden inside uapi struct bpf_timer */
> +struct bpf_work {
> +	struct bpf_async_cb cb;
> +	struct work_struct work;
> +	struct work_struct delete_work;
> +};
> +
> +/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
>  struct bpf_async_kern {
>  	union {
>  		struct bpf_async_cb *cb;
>  		struct bpf_hrtimer *timer;
> +		struct bpf_work *work;
>  	};
>  	/* bpf_spin_lock is used here instead of spinlock_t to make
>  	 * sure that it always fits into space reserved by struct bpf_timer
> @@ -1124,6 +1131,7 @@ struct bpf_async_kern {
>  
>  enum bpf_async_type {
>  	BPF_ASYNC_TYPE_TIMER = 0,
> +	BPF_ASYNC_TYPE_WQ,
>  };
>  
>  static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
> @@ -1167,11 +1175,75 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
>  	return HRTIMER_NORESTART;
>  }
>  
> +static void bpf_wq_work(struct work_struct *work)
> +{
> +	struct bpf_work *w = container_of(work, struct bpf_work, work);
> +	struct bpf_tramp_run_ctx __maybe_unused run_ctx;
> +	struct bpf_prog *prog = w->cb.prog;
> +	unsigned int flags = w->cb.flags;
> +	struct bpf_map *map = w->cb.map;
> +	bpf_callback_t callback_fn;
> +	void *value = w->cb.value;
> +	void *key;
> +	u32 idx;
> +
> +	BTF_TYPE_EMIT(struct bpf_wq);
> +
> +	callback_fn = READ_ONCE(w->cb.callback_fn);
> +	if (!callback_fn || !prog)
> +		return;
> +
> +	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
> +		struct bpf_array *array = container_of(map, struct bpf_array, map);
> +
> +		/* compute the key */
> +		idx = ((char *)value - array->value) / array->elem_size;
> +		key = &idx;
> +	} else { /* hash or lru */
> +		key = value - round_up(map->key_size, 8);
> +	}
> +
> +	run_ctx.bpf_cookie = 0;
> +
> +	if (flags & BPF_F_WQ_SLEEPABLE) {
> +		if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
> +			/* recursion detected */
> +			__bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
> +			return;
> +		}
> +	} else {
> +		if (!__bpf_prog_enter_recur(prog, &run_ctx)) {
> +			/* recursion detected */
> +			__bpf_prog_exit_recur(prog, 0, &run_ctx);
> +			return;
> +		}
> +	}
> +
> +	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
> +	/* The verifier checked that return value is zero. */
> +
> +	if (flags & BPF_F_WQ_SLEEPABLE)
> +		__bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
> +						&run_ctx);
> +	else
> +		__bpf_prog_exit_recur(prog, 0, &run_ctx);
> +}
> +
> +static void bpf_wq_delete_work(struct work_struct *work)
> +{
> +	struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
> +
> +	cancel_work_sync(&w->work);
> +
> +	kfree_rcu(w, cb.rcu);
> +}
> +
>  static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
>  			    enum bpf_async_type type)
>  {
>  	struct bpf_async_cb *cb;
>  	struct bpf_hrtimer *t;
> +	struct bpf_work *w;
>  	clockid_t clockid;
>  	size_t size;
>  	int ret = 0;
> @@ -1183,6 +1255,9 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
>  	case BPF_ASYNC_TYPE_TIMER:
>  		size = sizeof(struct bpf_hrtimer);
>  		break;
> +	case BPF_ASYNC_TYPE_WQ:
> +		size = sizeof(struct bpf_work);
> +		break;
>  	default:
>  		return -EINVAL;
>  	}
> @@ -1201,13 +1276,22 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
>  		goto out;
>  	}
>  
> -	if (type == BPF_ASYNC_TYPE_TIMER) {
> +	switch (type) {
> +	case BPF_ASYNC_TYPE_TIMER:
>  		clockid = flags & (MAX_CLOCKS - 1);
>  		t = (struct bpf_hrtimer *)cb;
>  
>  		hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
>  		t->timer.function = bpf_timer_cb;
>  		cb->value = (void *)async - map->record->timer_off;
> +		break;
> +	case BPF_ASYNC_TYPE_WQ:
> +		w = (struct bpf_work *)cb;
> +
> +		INIT_WORK(&w->work, bpf_wq_work);
> +		INIT_WORK(&w->delete_work, bpf_wq_delete_work);
> +		cb->value = (void *)async - map->record->wq_off;
> +		break;
>  	}
>  	cb->map = map;
>  	cb->prog = NULL;
> @@ -1473,7 +1557,19 @@ void bpf_timer_cancel_and_free(void *val)
>   */
>  void bpf_wq_cancel_and_free(void *val)
>  {
> +	struct bpf_work *work;
> +
>  	BTF_TYPE_EMIT(struct bpf_wq);
> +
> +	work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
> +	if (!work)
> +		return;
> +	/* Trigger cancel of the sleepable work, but *do not* wait for
> +	 * it to finish if it was running as we might not be in a
> +	 * sleepable context.
> +	 * kfree will be called once the work has finished.
> +	 */
> +	schedule_work(&work->delete_work);
>  }
>  
>  BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
> @@ -2612,6 +2708,19 @@ __bpf_kfunc void bpf_throw(u64 cookie)
>  	WARN(1, "A call to BPF exception callback should never return\n");
>  }
>  
> +__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *map, unsigned int flags)
> +{
> +	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
> +
> +	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
> +	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
> +
> +	if (flags & ~BPF_F_WQ_SLEEPABLE)
> +		return -EINVAL;
> +
> +	return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
> +}
> +
>  __bpf_kfunc_end_defs();
>  
>  BTF_KFUNCS_START(generic_btf_ids)
> @@ -2689,6 +2798,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
>  BTF_ID_FLAGS(func, bpf_dynptr_size)
>  BTF_ID_FLAGS(func, bpf_dynptr_clone)
>  BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
> +BTF_ID_FLAGS(func, bpf_wq_init)
>  BTF_KFUNCS_END(common_btf_ids)
>  
>  static const struct btf_kfunc_id_set common_kfunc_set = {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 112faf2cd7e9..5e8c1e65fe8c 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -11038,6 +11038,7 @@ enum special_kfunc_type {
>  	KF_bpf_percpu_obj_drop_impl,
>  	KF_bpf_throw,
>  	KF_bpf_iter_css_task_new,
> +	KF_bpf_wq_init,
>  };
>  
>  BTF_SET_START(special_kfunc_set)
> @@ -11064,6 +11065,7 @@ BTF_ID(func, bpf_throw)
>  #ifdef CONFIG_CGROUPS
>  BTF_ID(func, bpf_iter_css_task_new)
>  #endif
> +BTF_ID(func, bpf_wq_init)
>  BTF_SET_END(special_kfunc_set)
>  
>  BTF_ID_LIST(special_kfunc_list)
> @@ -11094,6 +11096,7 @@ BTF_ID(func, bpf_iter_css_task_new)
>  #else
>  BTF_ID_UNUSED
>  #endif
> +BTF_ID(func, bpf_wq_init)
>  
>  static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
>  {
> @@ -11171,6 +11174,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
>  	if (is_kfunc_arg_wq(meta->btf, &args[argno]))
>  		return KF_ARG_PTR_TO_WORKQUEUE;
>  
> +	if (meta->func_id == special_kfunc_list[KF_bpf_wq_init] && argno == 1)
> +		return KF_ARG_PTR_TO_MAP;
> +

Hmm. This function has this bit:
        if (is_kfunc_arg_map(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_MAP;

Just do:
+__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *map__map, ...

It was specifically added for bpf_arena_alloc_pages to pass pointer to a map.
In case of arena map type it's used as:
struct {
        __uint(type, BPF_MAP_TYPE_ARENA);
        ...
} arena SEC(".maps");

page = bpf_arena_alloc_pages(&arena, ...)
libbpf and the verifier do the right thing.
I think it should work here as well.
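
A sketch of how the kfunc could look with that suggestion applied (the
parameter name p__map is illustrative, and this is not the code from the
patch):

__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
{
	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;

	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));

	if (flags & ~BPF_F_WQ_SLEEPABLE)
		return -EINVAL;

	/* the __map suffix lets is_kfunc_arg_map() classify this argument as
	 * KF_ARG_PTR_TO_MAP, so the func_id-based special case added to
	 * get_kfunc_ptr_arg_type() in this patch would presumably no longer
	 * be needed.
	 */
	return __bpf_async_init(async, p__map, flags, BPF_ASYNC_TYPE_WQ);
}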
Benjamin Tissoires April 19, 2024, 3:12 p.m. UTC | #2
On Apr 18 2024, Alexei Starovoitov wrote:
> On Tue, Apr 16, 2024 at 04:08:24PM +0200, Benjamin Tissoires wrote:
> > We need to teach the verifier about the second argument which is declared
> > as void * but which is of type KF_ARG_PTR_TO_MAP. We could have dropped
> > this extra case if we declared the second argument as struct bpf_map *,
> > but that means users will have to do extra casting to have their program
> > compile.
> > 
> > We also need to duplicate the timer code for the checking if the map
> > argument is matching the provided workqueue.
> > 
> > Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
> > 
> > ---
> > 
> > Note that the timer code when matching for the map is checking for
> > constant map pointers. I wonder if this needs to be enforced too
> > (being constant?)
> > ---
> >  include/uapi/linux/bpf.h |   9 ++++
> >  kernel/bpf/helpers.c     | 114 ++++++++++++++++++++++++++++++++++++++++++++++-
> >  kernel/bpf/verifier.c    |   6 +++
> >  3 files changed, 127 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index e4ae83550fb3..519f6019d158 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -7502,4 +7502,13 @@ struct bpf_iter_num {
> >  	__u64 __opaque[1];
> >  } __attribute__((aligned(8)));
> >  
> > +/*
> > + * Flags to control bpf_wq_init() and bpf_wq_set_callback() behaviour.
> > + *     - BPF_F_WQ_SLEEPABLE: the callback needs to run in
> > + *       a sleepable context
> > + */
> > +enum {
> > +	BPF_F_WQ_SLEEPABLE = (1ULL << 0),
> > +};
> 
> Just started looking at the patch set. The first reaction that
> this flag is odd. Why add it? wq provides sleepable ctx.
> Why would the program ask to be non-sleepable in wq?

That was one of my questions: do we need it? Apparently not :)

If not, then that simplifies the series by a lot: patch 1/18 can be
dropped and 14/18 can be stripped down and squashed into 15/18.

> If it needs to callback to run in rcu cs, it can use bpf_rcu_read_lock() kfunc
> as the first call in such callback and it will be equivalent
> to not-passing this BPF_F_WQ_SLEEPABLE flag.
> It seem it can be dropped and complexity reduced.

yep :)

> The verifier complications in later patches due to this flag too...
> I just don't see the point.

It's something I added while writing the tests, and some tests were passing
when I had a non-sleepable callback. But if we have
bpf_rcu_read_lock(), we are all fine and can reduce the complexity.

> 
> > +
> >  #endif /* _UAPI__LINUX_BPF_H__ */
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index 9fd12d480b8b..9ac1b8bb3a01 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -1109,11 +1109,18 @@ struct bpf_hrtimer {
> >  	struct hrtimer timer;
> >  };
> >  
> > -/* the actual struct hidden inside uapi struct bpf_timer */
> > +struct bpf_work {
> > +	struct bpf_async_cb cb;
> > +	struct work_struct work;
> > +	struct work_struct delete_work;
> > +};
> > +
> > +/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
> >  struct bpf_async_kern {
> >  	union {
> >  		struct bpf_async_cb *cb;
> >  		struct bpf_hrtimer *timer;
> > +		struct bpf_work *work;
> >  	};
> >  	/* bpf_spin_lock is used here instead of spinlock_t to make
> >  	 * sure that it always fits into space reserved by struct bpf_timer
> > @@ -1124,6 +1131,7 @@ struct bpf_async_kern {
> >  
> >  enum bpf_async_type {
> >  	BPF_ASYNC_TYPE_TIMER = 0,
> > +	BPF_ASYNC_TYPE_WQ,
> >  };
> >  
> >  static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
> > @@ -1167,11 +1175,75 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
> >  	return HRTIMER_NORESTART;
> >  }
> >  
> > +static void bpf_wq_work(struct work_struct *work)
> > +{
> > +	struct bpf_work *w = container_of(work, struct bpf_work, work);
> > +	struct bpf_tramp_run_ctx __maybe_unused run_ctx;
> > +	struct bpf_prog *prog = w->cb.prog;
> > +	unsigned int flags = w->cb.flags;
> > +	struct bpf_map *map = w->cb.map;
> > +	bpf_callback_t callback_fn;
> > +	void *value = w->cb.value;
> > +	void *key;
> > +	u32 idx;
> > +
> > +	BTF_TYPE_EMIT(struct bpf_wq);
> > +
> > +	callback_fn = READ_ONCE(w->cb.callback_fn);
> > +	if (!callback_fn || !prog)
> > +		return;
> > +
> > +	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
> > +		struct bpf_array *array = container_of(map, struct bpf_array, map);
> > +
> > +		/* compute the key */
> > +		idx = ((char *)value - array->value) / array->elem_size;
> > +		key = &idx;
> > +	} else { /* hash or lru */
> > +		key = value - round_up(map->key_size, 8);
> > +	}
> > +
> > +	run_ctx.bpf_cookie = 0;
> > +
> > +	if (flags & BPF_F_WQ_SLEEPABLE) {
> > +		if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
> > +			/* recursion detected */
> > +			__bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
> > +			return;
> > +		}
> > +	} else {
> > +		if (!__bpf_prog_enter_recur(prog, &run_ctx)) {
> > +			/* recursion detected */
> > +			__bpf_prog_exit_recur(prog, 0, &run_ctx);
> > +			return;
> > +		}
> > +	}
> > +
> > +	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
> > +	/* The verifier checked that return value is zero. */
> > +
> > +	if (flags & BPF_F_WQ_SLEEPABLE)
> > +		__bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
> > +						&run_ctx);
> > +	else
> > +		__bpf_prog_exit_recur(prog, 0, &run_ctx);
> > +}
> > +
> > +static void bpf_wq_delete_work(struct work_struct *work)
> > +{
> > +	struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
> > +
> > +	cancel_work_sync(&w->work);
> > +
> > +	kfree_rcu(w, cb.rcu);
> > +}
> > +
> >  static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
> >  			    enum bpf_async_type type)
> >  {
> >  	struct bpf_async_cb *cb;
> >  	struct bpf_hrtimer *t;
> > +	struct bpf_work *w;
> >  	clockid_t clockid;
> >  	size_t size;
> >  	int ret = 0;
> > @@ -1183,6 +1255,9 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
> >  	case BPF_ASYNC_TYPE_TIMER:
> >  		size = sizeof(struct bpf_hrtimer);
> >  		break;
> > +	case BPF_ASYNC_TYPE_WQ:
> > +		size = sizeof(struct bpf_work);
> > +		break;
> >  	default:
> >  		return -EINVAL;
> >  	}
> > @@ -1201,13 +1276,22 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
> >  		goto out;
> >  	}
> >  
> > -	if (type == BPF_ASYNC_TYPE_TIMER) {
> > +	switch (type) {
> > +	case BPF_ASYNC_TYPE_TIMER:
> >  		clockid = flags & (MAX_CLOCKS - 1);
> >  		t = (struct bpf_hrtimer *)cb;
> >  
> >  		hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
> >  		t->timer.function = bpf_timer_cb;
> >  		cb->value = (void *)async - map->record->timer_off;
> > +		break;
> > +	case BPF_ASYNC_TYPE_WQ:
> > +		w = (struct bpf_work *)cb;
> > +
> > +		INIT_WORK(&w->work, bpf_wq_work);
> > +		INIT_WORK(&w->delete_work, bpf_wq_delete_work);
> > +		cb->value = (void *)async - map->record->wq_off;
> > +		break;
> >  	}
> >  	cb->map = map;
> >  	cb->prog = NULL;
> > @@ -1473,7 +1557,19 @@ void bpf_timer_cancel_and_free(void *val)
> >   */
> >  void bpf_wq_cancel_and_free(void *val)
> >  {
> > +	struct bpf_work *work;
> > +
> >  	BTF_TYPE_EMIT(struct bpf_wq);
> > +
> > +	work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
> > +	if (!work)
> > +		return;
> > +	/* Trigger cancel of the sleepable work, but *do not* wait for
> > +	 * it to finish if it was running as we might not be in a
> > +	 * sleepable context.
> > +	 * kfree will be called once the work has finished.
> > +	 */
> > +	schedule_work(&work->delete_work);
> >  }
> >  
> >  BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
> > @@ -2612,6 +2708,19 @@ __bpf_kfunc void bpf_throw(u64 cookie)
> >  	WARN(1, "A call to BPF exception callback should never return\n");
> >  }
> >  
> > +__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *map, unsigned int flags)
> > +{
> > +	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
> > +
> > +	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
> > +	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
> > +
> > +	if (flags & ~BPF_F_WQ_SLEEPABLE)
> > +		return -EINVAL;
> > +
> > +	return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
> > +}
> > +
> >  __bpf_kfunc_end_defs();
> >  
> >  BTF_KFUNCS_START(generic_btf_ids)
> > @@ -2689,6 +2798,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> >  BTF_ID_FLAGS(func, bpf_dynptr_size)
> >  BTF_ID_FLAGS(func, bpf_dynptr_clone)
> >  BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
> > +BTF_ID_FLAGS(func, bpf_wq_init)
> >  BTF_KFUNCS_END(common_btf_ids)
> >  
> >  static const struct btf_kfunc_id_set common_kfunc_set = {
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 112faf2cd7e9..5e8c1e65fe8c 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -11038,6 +11038,7 @@ enum special_kfunc_type {
> >  	KF_bpf_percpu_obj_drop_impl,
> >  	KF_bpf_throw,
> >  	KF_bpf_iter_css_task_new,
> > +	KF_bpf_wq_init,
> >  };
> >  
> >  BTF_SET_START(special_kfunc_set)
> > @@ -11064,6 +11065,7 @@ BTF_ID(func, bpf_throw)
> >  #ifdef CONFIG_CGROUPS
> >  BTF_ID(func, bpf_iter_css_task_new)
> >  #endif
> > +BTF_ID(func, bpf_wq_init)
> >  BTF_SET_END(special_kfunc_set)
> >  
> >  BTF_ID_LIST(special_kfunc_list)
> > @@ -11094,6 +11096,7 @@ BTF_ID(func, bpf_iter_css_task_new)
> >  #else
> >  BTF_ID_UNUSED
> >  #endif
> > +BTF_ID(func, bpf_wq_init)
> >  
> >  static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
> >  {
> > @@ -11171,6 +11174,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
> >  	if (is_kfunc_arg_wq(meta->btf, &args[argno]))
> >  		return KF_ARG_PTR_TO_WORKQUEUE;
> >  
> > +	if (meta->func_id == special_kfunc_list[KF_bpf_wq_init] && argno == 1)
> > +		return KF_ARG_PTR_TO_MAP;
> > +
> 
> Hmm. This function has this bit:
>         if (is_kfunc_arg_map(meta->btf, &args[argno]))
>                 return KF_ARG_PTR_TO_MAP;
> 
> Just do:
> +__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *map__map, ...

Oh, nice. I haven't noticed that. I'll try it in v2.

Cheers,
Benjamin

> 
> It was specifically added for bpf_arena_alloc_pages to pass pointer to a map.
> In case of arena map type it's used as:
> struct {
>         __uint(type, BPF_MAP_TYPE_ARENA);
>         ...
> } arena SEC(".maps");
> 
> page = bpf_arena_alloc_pages(&arena, ...)
> libbpf and the verifier do the right thing.
> I think it should work here as well.
Alexei Starovoitov April 19, 2024, 3:34 p.m. UTC | #3
On Fri, Apr 19, 2024 at 8:12 AM Benjamin Tissoires <bentiss@kernel.org> wrote:
>
>
> It's something I added while adding the tests. And some tests were passing
> in case I was having a non sleepable callback. But if we have
> bpf_rcu_read_lock(), we are all fine and can reduce the complexity.

Not quite following what the issue was.
Since the verifier unconditionally verifies such a callback as sleepable,
the callback won't be able to access rcu pointers without doing an
explicit bpf_rcu_read_lock() first (and a few other code patterns
might be rejected), but that's a good thing.
Maybe add a comment next to the set_cb kfunc that wq callbacks are sleepable.
I think bpf prog writers often have a kernel background,
so their natural assumption will be that the cb is sleepable.
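
As a rough BPF-side sketch of that idea (the callback signature is assumed
from how bpf_wq_work() invokes callback_fn in this patch; the extern kfunc
declarations are illustrative):

extern void bpf_rcu_read_lock(void) __ksym;
extern void bpf_rcu_read_unlock(void) __ksym;

static int wq_cb(void *map, int *key, void *value)
{
	/* wq callbacks are verified as sleepable; take an explicit RCU
	 * read-side critical section only around the accesses that need it.
	 */
	bpf_rcu_read_lock();
	/* ... dereference RCU-protected pointers here ... */
	bpf_rcu_read_unlock();
	return 0;
}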
Benjamin Tissoires April 19, 2024, 3:55 p.m. UTC | #4
On Apr 19 2024, Alexei Starovoitov wrote:
> On Fri, Apr 19, 2024 at 8:12 AM Benjamin Tissoires <bentiss@kernel.org> wrote:
> >
> >
> > It's something I added while adding the tests. And some tests were passing
> > in case I was having a non sleepable callback. But if we have
> > bpf_rcu_read_lock(), we are all fine and can reduce the complexity.
> 
> Not quite following what was the issue.
> Since the verifier is unconditionally verifying such callback as sleepable
> the callback won't be able to access rcu pointers without doing
> explicit bpf_rcu_read_lock() first (and few other code patterns
> might be rejected), but that's a good thing.

Oh, I missed that. Well, given that the verifier enforces everything, I
guess we are good :)

> Maybe next to set_cb kfunc add a comment that wq callbacks are sleepable.
> I think bpf prog writers are often with kernel background,
> so it will be their natural assumption that cb is sleepable.

I assume so as well.

Cheers,
Benjamin

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e4ae83550fb3..519f6019d158 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7502,4 +7502,13 @@  struct bpf_iter_num {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
 
+/*
+ * Flags to control bpf_wq_init() and bpf_wq_set_callback() behaviour.
+ *     - BPF_F_WQ_SLEEPABLE: the callback needs to run in
+ *       a sleepable context
+ */
+enum {
+	BPF_F_WQ_SLEEPABLE = (1ULL << 0),
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9fd12d480b8b..9ac1b8bb3a01 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1109,11 +1109,18 @@  struct bpf_hrtimer {
 	struct hrtimer timer;
 };
 
-/* the actual struct hidden inside uapi struct bpf_timer */
+struct bpf_work {
+	struct bpf_async_cb cb;
+	struct work_struct work;
+	struct work_struct delete_work;
+};
+
+/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
 struct bpf_async_kern {
 	union {
 		struct bpf_async_cb *cb;
 		struct bpf_hrtimer *timer;
+		struct bpf_work *work;
 	};
 	/* bpf_spin_lock is used here instead of spinlock_t to make
 	 * sure that it always fits into space reserved by struct bpf_timer
@@ -1124,6 +1131,7 @@  struct bpf_async_kern {
 
 enum bpf_async_type {
 	BPF_ASYNC_TYPE_TIMER = 0,
+	BPF_ASYNC_TYPE_WQ,
 };
 
 static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
@@ -1167,11 +1175,75 @@  static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
 	return HRTIMER_NORESTART;
 }
 
+static void bpf_wq_work(struct work_struct *work)
+{
+	struct bpf_work *w = container_of(work, struct bpf_work, work);
+	struct bpf_tramp_run_ctx __maybe_unused run_ctx;
+	struct bpf_prog *prog = w->cb.prog;
+	unsigned int flags = w->cb.flags;
+	struct bpf_map *map = w->cb.map;
+	bpf_callback_t callback_fn;
+	void *value = w->cb.value;
+	void *key;
+	u32 idx;
+
+	BTF_TYPE_EMIT(struct bpf_wq);
+
+	callback_fn = READ_ONCE(w->cb.callback_fn);
+	if (!callback_fn || !prog)
+		return;
+
+	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+		/* compute the key */
+		idx = ((char *)value - array->value) / array->elem_size;
+		key = &idx;
+	} else { /* hash or lru */
+		key = value - round_up(map->key_size, 8);
+	}
+
+	run_ctx.bpf_cookie = 0;
+
+	if (flags & BPF_F_WQ_SLEEPABLE) {
+		if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
+			/* recursion detected */
+			__bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
+			return;
+		}
+	} else {
+		if (!__bpf_prog_enter_recur(prog, &run_ctx)) {
+			/* recursion detected */
+			__bpf_prog_exit_recur(prog, 0, &run_ctx);
+			return;
+		}
+	}
+
+	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+	/* The verifier checked that return value is zero. */
+
+	if (flags & BPF_F_WQ_SLEEPABLE)
+		__bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
+						&run_ctx);
+	else
+		__bpf_prog_exit_recur(prog, 0, &run_ctx);
+}
+
+static void bpf_wq_delete_work(struct work_struct *work)
+{
+	struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
+
+	cancel_work_sync(&w->work);
+
+	kfree_rcu(w, cb.rcu);
+}
+
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
 			    enum bpf_async_type type)
 {
 	struct bpf_async_cb *cb;
 	struct bpf_hrtimer *t;
+	struct bpf_work *w;
 	clockid_t clockid;
 	size_t size;
 	int ret = 0;
@@ -1183,6 +1255,9 @@  static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 	case BPF_ASYNC_TYPE_TIMER:
 		size = sizeof(struct bpf_hrtimer);
 		break;
+	case BPF_ASYNC_TYPE_WQ:
+		size = sizeof(struct bpf_work);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1201,13 +1276,22 @@  static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		goto out;
 	}
 
-	if (type == BPF_ASYNC_TYPE_TIMER) {
+	switch (type) {
+	case BPF_ASYNC_TYPE_TIMER:
 		clockid = flags & (MAX_CLOCKS - 1);
 		t = (struct bpf_hrtimer *)cb;
 
 		hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
 		t->timer.function = bpf_timer_cb;
 		cb->value = (void *)async - map->record->timer_off;
+		break;
+	case BPF_ASYNC_TYPE_WQ:
+		w = (struct bpf_work *)cb;
+
+		INIT_WORK(&w->work, bpf_wq_work);
+		INIT_WORK(&w->delete_work, bpf_wq_delete_work);
+		cb->value = (void *)async - map->record->wq_off;
+		break;
 	}
 	cb->map = map;
 	cb->prog = NULL;
@@ -1473,7 +1557,19 @@  void bpf_timer_cancel_and_free(void *val)
  */
 void bpf_wq_cancel_and_free(void *val)
 {
+	struct bpf_work *work;
+
 	BTF_TYPE_EMIT(struct bpf_wq);
+
+	work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
+	if (!work)
+		return;
+	/* Trigger cancel of the sleepable work, but *do not* wait for
+	 * it to finish if it was running as we might not be in a
+	 * sleepable context.
+	 * kfree will be called once the work has finished.
+	 */
+	schedule_work(&work->delete_work);
 }
 
 BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
@@ -2612,6 +2708,19 @@  __bpf_kfunc void bpf_throw(u64 cookie)
 	WARN(1, "A call to BPF exception callback should never return\n");
 }
 
+__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *map, unsigned int flags)
+{
+	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+
+	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
+	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
+
+	if (flags & ~BPF_F_WQ_SLEEPABLE)
+		return -EINVAL;
+
+	return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -2689,6 +2798,7 @@  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
 BTF_ID_FLAGS(func, bpf_dynptr_size)
 BTF_ID_FLAGS(func, bpf_dynptr_clone)
 BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
+BTF_ID_FLAGS(func, bpf_wq_init)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 112faf2cd7e9..5e8c1e65fe8c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11038,6 +11038,7 @@  enum special_kfunc_type {
 	KF_bpf_percpu_obj_drop_impl,
 	KF_bpf_throw,
 	KF_bpf_iter_css_task_new,
+	KF_bpf_wq_init,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -11064,6 +11065,7 @@  BTF_ID(func, bpf_throw)
 #ifdef CONFIG_CGROUPS
 BTF_ID(func, bpf_iter_css_task_new)
 #endif
+BTF_ID(func, bpf_wq_init)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -11094,6 +11096,7 @@  BTF_ID(func, bpf_iter_css_task_new)
 #else
 BTF_ID_UNUSED
 #endif
+BTF_ID(func, bpf_wq_init)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -11171,6 +11174,9 @@  get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_wq(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_WORKQUEUE;
 
+	if (meta->func_id == special_kfunc_list[KF_bpf_wq_init] && argno == 1)
+		return KF_ARG_PTR_TO_MAP;
+
 	if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
 		if (!btf_type_is_struct(ref_t)) {
 			verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",