@@ -27,6 +27,8 @@
#include <linux/bpfptr.h>
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
+#include <linux/irq_work.h>
+#include <linux/llist.h>
struct bpf_verifier_env;
struct bpf_verifier_log;
@@ -460,6 +462,7 @@ enum bpf_arg_type {
ARG_PTR_TO_TIMER, /* pointer to bpf_timer */
ARG_PTR_TO_KPTR, /* pointer to referenced kptr */
ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */
+ ARG_PTR_TO_DELAYED_WORK, /* pointer to bpf_delayed_work */
__BPF_ARG_TYPE_MAX,
/* Extended arg_types. */
@@ -1101,6 +1104,9 @@ struct bpf_prog_aux {
u32 linfo_idx;
u32 num_exentries;
struct exception_table_entry *extable;
+
+ /* initialized at load time if program uses delayed work helpers */
+ struct bpf_delayed_irq_work *irq_work;
union {
struct work_struct work;
struct rcu_head rcu;
@@ -2526,4 +2532,11 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
int bpf_dynptr_check_size(u32 size);
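+/* Per-program bookkeeping for the delayed work helpers: submitted
+ * bpf_delayed_work items are collected on @items and drained from the
+ * hard irq_work @work. A reference on @prog is taken for every
+ * submitted item and dropped once its callback has run.
+ */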
+struct bpf_delayed_irq_work {
+ struct llist_head items;
+ struct irq_work work;
+ struct bpf_prog *prog;
+};
+void bpf_delayed_work_irq_work_cb(struct irq_work *work);
+
#endif /* _LINUX_BPF_H */
@@ -5325,6 +5325,29 @@ union bpf_attr {
* **-EACCES** if the SYN cookie is not valid.
*
* **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ * long bpf_delayed_work_submit(struct bpf_delayed_work *work, void *cb, void *data, int flags)
+ * Description
+ * Submit a callback for execution in a different context.
+ * Currently only hard IRQ context (irq_work) is supported.
+ *
+ * *work* must be a member of a map value.
+ *
+ * *cb* is the function to call. It receives *data* as its
+ * sole argument.
+ *
+ * *data* must be part of a map value or NULL.
+ *
+ * *flags* must be **BPF_DELAYED_WORK_IRQWORK**.
+ * Return
+ * 0 when the work is successfully submitted.
+ *
+ * **-EINVAL** if *cb* is NULL or *work* is already in use.
+ *
+ * **-EOPNOTSUPP** if *flags* is not **BPF_DELAYED_WORK_IRQWORK**,
+ * or if called from an NMI handler on an architecture without
+ * NMI-safe cmpxchg.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -5535,6 +5558,7 @@ union bpf_attr {
FN(tcp_raw_gen_syncookie_ipv6), \
FN(tcp_raw_check_syncookie_ipv4), \
FN(tcp_raw_check_syncookie_ipv6), \
+ FN(delayed_work_submit), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6699,6 +6723,10 @@ struct bpf_delayed_work {
__u64 :64;
} __attribute__((aligned(8)));
+enum {
+ BPF_DELAYED_WORK_IRQWORK = (1UL << 0),
+};
+
struct bpf_sysctl {
__u32 write; /* Sysctl is being read (= 0) or written (= 1).
* Allows 1,2,4-byte read, but no write.
@@ -2567,6 +2567,14 @@ static void bpf_prog_free_deferred(struct work_struct *work)
int i;
aux = container_of(work, struct bpf_prog_aux, work);
+
+ /* We have already waited for a qs of the appropriate RCU variety,
+ * so we can expect no further submissions of work. Just wait for
+ * the currently scheduled work to finish before releasing anything.
+ */
+ if (aux->irq_work) {
+ irq_work_sync(&aux->irq_work->work);
+ kfree(aux->irq_work);
+ }
+
#ifdef CONFIG_BPF_SYSCALL
bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
#endif
@@ -18,6 +18,8 @@
#include <linux/proc_ns.h>
#include <linux/security.h>
#include <linux/btf_ids.h>
+#include <linux/irq_work.h>
+#include <linux/llist.h>
#include "../../lib/kstrtox.h"
@@ -1575,6 +1577,94 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = {
.arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
};
+struct bpf_delayed_work_kern {
+ struct llist_node item;
+ u64 flags; /* used as a lock field */
+ void (*cb)(void *);
+ void *data;
+} __aligned(8);
+
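+/* Lifecycle of bpf_delayed_work_kern::flags, driven by cmpxchg:
+ * FREE -> CLAIMED (a submitter owns the slot), CLAIMED -> READY (cb and
+ * data are published and the item is queued), READY -> FREE (the
+ * callback has run and the slot may be reused).
+ */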
+#define BPF_DELAYED_WORK_FREE (0)
+#define BPF_DELAYED_WORK_CLAIMED (1)
+#define BPF_DELAYED_WORK_READY (2)
+
+void bpf_delayed_work_irq_work_cb(struct irq_work *work)
+{
+ struct bpf_delayed_irq_work *bpf_irq_work = container_of(work, struct bpf_delayed_irq_work, work);
+ struct bpf_delayed_work_kern *work_item, *next;
+ struct llist_node *work_list = llist_del_all(&bpf_irq_work->items);
+
+ /* Traverse in submission order to preserve ordering semantics */
+ work_list = llist_reverse_order(work_list);
+
+ llist_for_each_entry_safe(work_item, next, work_list, item) {
+ WARN_ONCE(work_item->flags != BPF_DELAYED_WORK_READY, "incomplete bpf_delayed_work found");
+
+ work_item->cb(work_item->data);
+
+ work_item->cb = work_item->data = NULL;
+ bpf_prog_put(bpf_irq_work->prog);
+ xchg(&work_item->flags, BPF_DELAYED_WORK_FREE);
+ }
+}
+
+BPF_CALL_5(bpf_delayed_work_submit, struct bpf_delayed_work_kern *, work,
+ void *, callback_fn, void *, data, int, flags, struct bpf_prog_aux *, aux)
+{
+ u64 ret;
+ struct bpf_prog *prog;
+
+ BUILD_BUG_ON(sizeof(struct bpf_delayed_work_kern) > sizeof(struct bpf_delayed_work));
+ BUILD_BUG_ON(__alignof__(struct bpf_delayed_work_kern) != __alignof__(struct bpf_delayed_work));
+ BTF_TYPE_EMIT(struct bpf_delayed_work);
+
+ if (callback_fn == NULL)
+ return -EINVAL;
+
+ if (flags != BPF_DELAYED_WORK_IRQWORK)
+ return -EOPNOTSUPP;
+
+ if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && in_nmi())
+ return -EOPNOTSUPP;
+
+ ret = cmpxchg(&work->flags, BPF_DELAYED_WORK_FREE, BPF_DELAYED_WORK_CLAIMED);
+ if (ret != BPF_DELAYED_WORK_FREE)
+ return -EINVAL;
+
+ work->data = data;
+ work->cb = callback_fn;
+
+ ret = cmpxchg(&work->flags, BPF_DELAYED_WORK_CLAIMED, BPF_DELAYED_WORK_READY);
+ if (WARN_ONCE(ret != BPF_DELAYED_WORK_CLAIMED,
+ "bpf_delayed_work item altered while claimed"))
+ return -EINVAL;
+
+ /* Bump the ref count for every work item submitted by the program. */
+ prog = bpf_prog_inc_not_zero(aux->prog);
+ if (IS_ERR(prog)) {
+ /* Release the slot so it can be submitted again later. */
+ work->cb = work->data = NULL;
+ xchg(&work->flags, BPF_DELAYED_WORK_FREE);
+ return PTR_ERR(prog);
+ }
+
+ llist_add(&work->item, &aux->irq_work->items);
+
+ /* It's okay if this prog's irq_work is already submitted,
+ * it will walk the same list of callbacks anyway.
+ */
+ (void) irq_work_queue(&aux->irq_work->work);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_delayed_work_submit_proto = {
+ .func = bpf_delayed_work_submit,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_DELAYED_WORK,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE, /* TODO: need ptr_to_map_value_mem */
+ .arg4_type = ARG_ANYTHING,
+};
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
@@ -1643,6 +1733,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_dynptr_write_proto;
case BPF_FUNC_dynptr_data:
return &bpf_dynptr_data_proto;
+ case BPF_FUNC_delayed_work_submit:
+ return &bpf_delayed_work_submit_proto;
default:
break;
}
@@ -5490,6 +5490,55 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
return 0;
}
+static int process_delayed_work_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_delayed_work has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_delayed_work\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (!map_value_has_delayed_work(map)) {
+ if (map->delayed_work_off == -E2BIG)
+ verbose(env,
+ "map '%s' has more than one 'struct bpf_delayed_work'\n",
+ map->name);
+ else if (map->delayed_work_off == -ENOENT)
+ verbose(env,
+ "map '%s' doesn't have 'struct bpf_delayed_work'\n",
+ map->name);
+ else
+ verbose(env,
+ "map '%s' is not a struct type or bpf_delayed_work is mangled\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (map->delayed_work_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_delayed_work' that is at %d\n",
+ val + reg->off, map->delayed_work_off);
+ return -EINVAL;
+ }
+ if (meta->map_ptr) {
+ verbose(env, "verifier bug. Two map pointers in a timer helper\n");
+ return -EFAULT;
+ }
+
+ meta->map_uid = reg->map_uid;
+ meta->map_ptr = map;
+ return 0;
+}
+
static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
@@ -5677,6 +5726,7 @@ static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK }
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types delayed_work_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
@@ -5704,6 +5754,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_TIMER] = &timer_types,
[ARG_PTR_TO_KPTR] = &kptr_types,
[ARG_PTR_TO_DYNPTR] = &stack_ptr_types,
+ [ARG_PTR_TO_DELAYED_WORK] = &delayed_work_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -6018,6 +6069,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
} else if (arg_type == ARG_PTR_TO_TIMER) {
if (process_timer_func(env, regno, meta))
return -EACCES;
+ } else if (arg_type == ARG_PTR_TO_DELAYED_WORK) {
+ if (process_delayed_work_func(env, regno, meta))
+ return -EACCES;
} else if (arg_type == ARG_PTR_TO_FUNC) {
meta->subprogno = reg->subprogno;
} else if (base_type(arg_type) == ARG_PTR_TO_MEM) {
@@ -6670,7 +6724,8 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (insn->code == (BPF_JMP | BPF_CALL) &&
insn->src_reg == 0 &&
- insn->imm == BPF_FUNC_timer_set_callback) {
+ (insn->imm == BPF_FUNC_timer_set_callback ||
+ insn->imm == BPF_FUNC_delayed_work_submit)) {
struct bpf_verifier_state *async_cb;
/* there is no real recursion here. timer callbacks are async */
@@ -6898,6 +6953,30 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_delayed_work_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_delayed_work_submit(struct bpf_delayed_work *work,
+ * void *callback_fn, void *data, u64 flags);
+ *
+ * callback_fn(void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_1].map_ptr = caller->regs[BPF_REG_3].map_ptr;
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_2]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ return 0;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -7294,6 +7373,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
reg_type_str(env, regs[BPF_REG_1].type));
return -EACCES;
}
+ break;
+ case BPF_FUNC_delayed_work_submit:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_delayed_work_callback_state);
+ break;
}
if (err)
@@ -7468,6 +7552,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
env->prog->call_get_stack = true;
+ if (func_id == BPF_FUNC_delayed_work_submit && !env->prog->aux->irq_work) {
+ /* Allocate the per-program irq_work once, regardless of how
+ * many call sites use the helper.
+ */
+ struct bpf_delayed_irq_work *irq_work;
+
+ irq_work = kmalloc(sizeof(*irq_work), GFP_KERNEL);
+ if (!irq_work) {
+ verbose(env, "could not allocate irq_work\n");
+ return -ENOMEM;
+ }
+
+ init_llist_head(&irq_work->items);
+ irq_work->work = IRQ_WORK_INIT_HARD(&bpf_delayed_work_irq_work_cb);
+ irq_work->prog = env->prog;
+ env->prog->aux->irq_work = irq_work;
+ }
+
if (func_id == BPF_FUNC_get_func_ip) {
if (check_get_func_ip(env))
return -ENOTSUPP;
@@ -14061,6 +14160,28 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
+ if (insn->imm == BPF_FUNC_delayed_work_submit) {
+ /* Add prog->aux as the hidden 5th argument to bpf_delayed_work_submit(). */
+ struct bpf_insn ld_addrs[2] = {
+ BPF_LD_IMM64(BPF_REG_5, (long)prog->aux),
+ };
+
+ insn_buf[0] = ld_addrs[0];
+ insn_buf[1] = ld_addrs[1];
+ insn_buf[2] = *insn;
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
if (insn->imm == BPF_FUNC_task_storage_get ||
insn->imm == BPF_FUNC_sk_storage_get ||
insn->imm == BPF_FUNC_inode_storage_get) {
@@ -637,6 +637,7 @@ class PrinterHelpers(Printer):
'struct bpf_dynptr',
'struct iphdr',
'struct ipv6hdr',
+ 'struct bpf_delayed_work',
]
known_types = {
'...',
@@ -690,6 +691,7 @@ class PrinterHelpers(Printer):
'struct bpf_dynptr',
'struct iphdr',
'struct ipv6hdr',
+ 'struct bpf_delayed_work',
}
mapped_types = {
'u8': '__u8',
@@ -5325,6 +5325,28 @@ union bpf_attr {
* **-EACCES** if the SYN cookie is not valid.
*
* **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ * long bpf_delayed_work_submit(struct bpf_delayed_work *work, void *cb, void *data, int flags)
+ * Description
+ * Submit a callback for execution in a different context.
+ * Currently only hard IRQ context (irq_work) is supported.
+ *
+ * *work* must be a member of a map value.
+ *
+ * *cb* is the function to call. It receives *data* as its
+ * sole argument.
+ *
+ * *data* must be part of a map value or NULL.
+ *
+ * *flags* must be **BPF_DELAYED_WORK_IRQWORK**.
+ * Return
+ * 0 when the work is successfully submitted.
+ *
+ * **-EINVAL** if *cb* is NULL or *work* is already in use.
+ *
+ * **-EOPNOTSUPP** if *flags* is not **BPF_DELAYED_WORK_IRQWORK**,
+ * or if called from an NMI handler on an architecture without
+ * NMI-safe cmpxchg.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -5535,6 +5557,7 @@ union bpf_attr {
FN(tcp_raw_gen_syncookie_ipv6), \
FN(tcp_raw_check_syncookie_ipv4), \
FN(tcp_raw_check_syncookie_ipv6), \
+ FN(delayed_work_submit), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6699,6 +6722,10 @@ struct bpf_delayed_work {
__u64 :64;
} __attribute__((aligned(8)));
+enum {
+ BPF_DELAYED_WORK_IRQWORK = (1UL << 0),
+};
+
struct bpf_sysctl {
__u32 write; /* Sysctl is being read (= 0) or written (= 1).
* Allows 1,2,4-byte read, but no write.
Add a new helper function that can schedule a callback to execute in a
different context. Initially, only irq_work (i.e. hardirq) is supported.

A key consideration is that we need this to work in an NMI context.
Therefore, we use a queue of pre-allocated llist nodes inside
bpf_delayed_work, which we drain on a per-program basis. To avoid races
on the bpf_delayed_work items, we implement a simple lock scheme based
on cmpxchg ordering.

Signed-off-by: Delyan Kratunov <delyank@fb.com>
---
 include/linux/bpf.h            |  13 ++++
 include/uapi/linux/bpf.h       |  28 ++++++++
 kernel/bpf/core.c              |   8 +++
 kernel/bpf/helpers.c           |  92 ++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 123 ++++++++++++++++++++++++++++++++-
 scripts/bpf_doc.py             |   2 +
 tools/include/uapi/linux/bpf.h |  27 ++++++++
 7 files changed, 292 insertions(+), 1 deletion(-)
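For reviewers, a minimal usage sketch from the BPF program side (not part
of this patch). The map, program and section names are illustrative, it
assumes bpf_helper_defs.h has been regenerated so the new helper is
declared, and the callback is written to return 0 on the assumption that
it follows the same return convention the verifier enforces for other
async callbacks (e.g. bpf_timer):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

struct elem {
        struct bpf_delayed_work work;
        __u64 counter;
};

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, int);
        __type(value, struct elem);
} work_map SEC(".maps");

/* Runs later, from hard IRQ context, with the *data* pointer that was
 * passed to bpf_delayed_work_submit().
 */
static int deferred_cb(void *ctx)
{
        struct elem *e = ctx;

        __sync_fetch_and_add(&e->counter, 1);
        return 0;
}

SEC("perf_event")
int handle_event(void *ctx)
{
        int key = 0;
        struct elem *e = bpf_map_lookup_elem(&work_map, &key);

        if (!e)
                return 0;

        /* Returns -EINVAL if this work item is still pending. */
        bpf_delayed_work_submit(&e->work, deferred_cb, e,
                                BPF_DELAYED_WORK_IRQWORK);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";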