Message ID | 5e5b91efc6e06a90fb4d2440ddcbe9b55ee464be.1726132802.git.tanggeliang@kylinos.cn (mailing list archive) |
---|---|
State | Awaiting Upstream |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [mptcp-next,v5,1/5] bpf: Add mptcp_subflow bpf_iter | expand |
On Thu, Sep 12, 2024 at 2:26 AM Geliang Tang <geliang@kernel.org> wrote: > > From: Geliang Tang <tanggeliang@kylinos.cn> > > It's necessary to traverse all subflows on the conn_list of an MPTCP > socket and then call kfunc to modify the fields of each subflow. In > kernel space, mptcp_for_each_subflow() helper is used for this: > > mptcp_for_each_subflow(msk, subflow) > kfunc(subflow); > > But in the MPTCP BPF program, this has not yet been implemented. As > Martin suggested recently, this conn_list walking + modify-by-kfunc > usage fits the bpf_iter use case. So this patch adds a new bpf_iter > type named "mptcp_subflow" to do this and implements its helpers > bpf_iter_mptcp_subflow_new()/_next()/_destroy(). > > Since these bpf_iter mptcp_subflow helpers are invoked in its selftest > in a ftrace hook for mptcp_sched_get_send(), it's necessary to register > them into a BPF_PROG_TYPE_TRACING type kfunc set together with other > two used kfuncs mptcp_subflow_active() and mptcp_subflow_set_scheduled(). > > Then bpf_for_each() for mptcp_subflow can be used in BPF program like > this: > > i = 0; > bpf_rcu_read_lock(); > bpf_for_each(mptcp_subflow, subflow, msk) { > if (i++ >= MPTCP_SUBFLOWS_MAX) > break; > kfunc(subflow); > } > bpf_rcu_read_unlock(); > > v2: remove msk->pm.lock in _new() and _destroy() (Martin) > drop DEFINE_BPF_ITER_FUNC, change opaque[3] to opaque[2] (Andrii) > v3: drop bpf_iter__mptcp_subflow > v4: if msk is NULL, initialize kit->msk to NULL in _new() and check it in > _next() (Andrii) > > Suggested-by: Martin KaFai Lau <martin.lau@kernel.org> > Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn> > --- > net/mptcp/bpf.c | 57 ++++++++++++++++++++++++++++++++++++++++++++----- > 1 file changed, 52 insertions(+), 5 deletions(-) > Looks ok from setting up open-coded iterator things, but I can't speak for other aspects I mentioned below. 
> diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c > index 6414824402e6..fec18e7e5e4a 100644 > --- a/net/mptcp/bpf.c > +++ b/net/mptcp/bpf.c > @@ -201,9 +201,51 @@ static const struct btf_kfunc_id_set bpf_mptcp_fmodret_set = { > .set = &bpf_mptcp_fmodret_ids, > }; > > -__diag_push(); > -__diag_ignore_all("-Wmissing-prototypes", > - "kfuncs which will be used in BPF programs"); > +struct bpf_iter_mptcp_subflow { > + __u64 __opaque[2]; > +} __attribute__((aligned(8))); > + > +struct bpf_iter_mptcp_subflow_kern { > + struct mptcp_sock *msk; > + struct list_head *pos; > +} __attribute__((aligned(8))); > + > +__bpf_kfunc_start_defs(); > + > +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct bpf_iter_mptcp_subflow *it, > + struct mptcp_sock *msk) > +{ > + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; > + > + kit->msk = msk; > + if (!msk) > + return -EINVAL; > + > + kit->pos = &msk->conn_list; > + return 0; > +} > + > +__bpf_kfunc struct mptcp_subflow_context * > +bpf_iter_mptcp_subflow_next(struct bpf_iter_mptcp_subflow *it) > +{ > + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; > + struct mptcp_subflow_context *subflow; > + struct mptcp_sock *msk = kit->msk; > + > + if (!msk) > + return NULL; > + > + subflow = list_entry(kit->pos->next, struct mptcp_subflow_context, node); > + if (!subflow || list_entry_is_head(subflow, &msk->conn_list, node)) it's a bit weird that you need both !subflow and list_entry_is_head(). Can you have NULL subflow at all? But this is the question to msk->conn_list semantics. 
> + return NULL; > + > + kit->pos = &subflow->node; > + return subflow; > +} > + > +__bpf_kfunc void bpf_iter_mptcp_subflow_destroy(struct bpf_iter_mptcp_subflow *it) > +{ > +} > > __bpf_kfunc struct mptcp_subflow_context * > bpf_mptcp_subflow_ctx_by_pos(const struct mptcp_sched_data *data, unsigned int pos) > @@ -218,12 +260,15 @@ __bpf_kfunc bool bpf_mptcp_subflow_queues_empty(struct sock *sk) > return tcp_rtx_queue_empty(sk); > } > > -__diag_pop(); > +__bpf_kfunc_end_defs(); > > BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids) > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new) I'm not 100% sure, but I suspect you might need to specify KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk` is a valid owned kernel object. Other BPF folks might help to clarify this. > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next) > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy) > +BTF_ID_FLAGS(func, mptcp_subflow_active) > BTF_ID_FLAGS(func, mptcp_subflow_set_scheduled) > BTF_ID_FLAGS(func, bpf_mptcp_subflow_ctx_by_pos) > -BTF_ID_FLAGS(func, mptcp_subflow_active) > BTF_ID_FLAGS(func, mptcp_set_timeout) > BTF_ID_FLAGS(func, mptcp_wnd_end) > BTF_ID_FLAGS(func, tcp_stream_memory_free) > @@ -241,6 +286,8 @@ static int __init bpf_mptcp_kfunc_init(void) > int ret; > > ret = register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set); > + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, > + &bpf_mptcp_sched_kfunc_set); > ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, > &bpf_mptcp_sched_kfunc_set); > #ifdef CONFIG_BPF_JIT > -- > 2.43.0 > >
Hi Andrii, On Thu, 2024-09-12 at 11:24 -0700, Andrii Nakryiko wrote: > On Thu, Sep 12, 2024 at 2:26 AM Geliang Tang <geliang@kernel.org> > wrote: > > > > From: Geliang Tang <tanggeliang@kylinos.cn> > > > > It's necessary to traverse all subflows on the conn_list of an > > MPTCP > > socket and then call kfunc to modify the fields of each subflow. In > > kernel space, mptcp_for_each_subflow() helper is used for this: > > > > mptcp_for_each_subflow(msk, subflow) > > kfunc(subflow); > > > > But in the MPTCP BPF program, this has not yet been implemented. As > > Martin suggested recently, this conn_list walking + modify-by-kfunc > > usage fits the bpf_iter use case. So this patch adds a new bpf_iter > > type named "mptcp_subflow" to do this and implements its helpers > > bpf_iter_mptcp_subflow_new()/_next()/_destroy(). > > > > Since these bpf_iter mptcp_subflow helpers are invoked in its > > selftest > > in a ftrace hook for mptcp_sched_get_send(), it's necessary to > > register > > them into a BPF_PROG_TYPE_TRACING type kfunc set together with > > other > > two used kfuncs mptcp_subflow_active() and > > mptcp_subflow_set_scheduled(). 
> > > > Then bpf_for_each() for mptcp_subflow can be used in BPF program > > like > > this: > > > > i = 0; > > bpf_rcu_read_lock(); > > bpf_for_each(mptcp_subflow, subflow, msk) { > > if (i++ >= MPTCP_SUBFLOWS_MAX) > > break; > > kfunc(subflow); > > } > > bpf_rcu_read_unlock(); > > > > v2: remove msk->pm.lock in _new() and _destroy() (Martin) > > drop DEFINE_BPF_ITER_FUNC, change opaque[3] to opaque[2] > > (Andrii) > > v3: drop bpf_iter__mptcp_subflow > > v4: if msk is NULL, initialize kit->msk to NULL in _new() and check > > it in > > _next() (Andrii) > > > > Suggested-by: Martin KaFai Lau <martin.lau@kernel.org> > > Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn> > > --- > > net/mptcp/bpf.c | 57 ++++++++++++++++++++++++++++++++++++++++++++- > > ---- > > 1 file changed, 52 insertions(+), 5 deletions(-) > > > > Looks ok from setting up open-coded iterator things, but I can't > speak > for other aspects I mentioned below. > > > diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c > > index 6414824402e6..fec18e7e5e4a 100644 > > --- a/net/mptcp/bpf.c > > +++ b/net/mptcp/bpf.c > > @@ -201,9 +201,51 @@ static const struct btf_kfunc_id_set > > bpf_mptcp_fmodret_set = { > > .set = &bpf_mptcp_fmodret_ids, > > }; > > > > -__diag_push(); > > -__diag_ignore_all("-Wmissing-prototypes", > > - "kfuncs which will be used in BPF programs"); > > +struct bpf_iter_mptcp_subflow { > > + __u64 __opaque[2]; > > +} __attribute__((aligned(8))); > > + > > +struct bpf_iter_mptcp_subflow_kern { > > + struct mptcp_sock *msk; > > + struct list_head *pos; > > +} __attribute__((aligned(8))); > > + > > +__bpf_kfunc_start_defs(); > > + > > +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct > > bpf_iter_mptcp_subflow *it, > > + struct mptcp_sock *msk) > > +{ > > + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; > > + > > + kit->msk = msk; > > + if (!msk) > > + return -EINVAL; > > + > > + kit->pos = &msk->conn_list; > > + return 0; > > +} > > + > > +__bpf_kfunc struct mptcp_subflow_context * > 
> +bpf_iter_mptcp_subflow_next(struct bpf_iter_mptcp_subflow *it) > > +{ > > + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; > > + struct mptcp_subflow_context *subflow; > > + struct mptcp_sock *msk = kit->msk; > > + > > + if (!msk) > > + return NULL; > > + > > + subflow = list_entry(kit->pos->next, struct > > mptcp_subflow_context, node); > > + if (!subflow || list_entry_is_head(subflow, &msk- > > >conn_list, node)) > > it's a bit weird that you need both !subflow and > list_entry_is_head(). > Can you have NULL subflow at all? But this is the question to > msk->conn_list semantics. Do you mean to return subflow regardless of whether subflow is NULL? If so, we need to use list_is_last() instead of list_entry_is_head(): { struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; if (!kit->msk || list_is_last(kit->pos, &kit->msk->conn_list)) return NULL; kit->pos = kit->pos->next; return list_entry(kit->pos, struct mptcp_subflow_context, node); } Hope this is a better version. > > > + return NULL; > > + > > + kit->pos = &subflow->node; > > + return subflow; > > +} > > + > > +__bpf_kfunc void bpf_iter_mptcp_subflow_destroy(struct > > bpf_iter_mptcp_subflow *it) > > +{ > > +} > > > > __bpf_kfunc struct mptcp_subflow_context * > > bpf_mptcp_subflow_ctx_by_pos(const struct mptcp_sched_data *data, > > unsigned int pos) > > @@ -218,12 +260,15 @@ __bpf_kfunc bool > > bpf_mptcp_subflow_queues_empty(struct sock *sk) > > return tcp_rtx_queue_empty(sk); > > } > > > > -__diag_pop(); > > +__bpf_kfunc_end_defs(); > > > > BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids) > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new) > > I'm not 100% sure, but I suspect you might need to specify > KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk` is a KF_TRUSTED_ARGS breaks the selftest in patch 2 [1]: ''' libbpf: prog 'trace_mptcp_sched_get_send': BPF program load failed: Invalid argument libbpf: prog 'trace_mptcp_sched_get_send': -- BEGIN PROG LOAD LOG -- 0: R1=ctx() R10=fp0 ; int 
BPF_PROG(trace_mptcp_sched_get_send, struct mptcp_sock *msk) @ mptcp_bpf_iter.c:13 0: (79) r6 = *(u64 *)(r1 +0) func 'mptcp_sched_get_send' arg0 has btf_id 28019 type STRUCT 'mptcp_sock' 1: R1=ctx() R6_w=ptr_mptcp_sock() ; if (bpf_get_current_pid_tgid() >> 32 != pid) @ mptcp_bpf_iter.c:17 1: (85) call bpf_get_current_pid_tgid#14 ; R0_w=scalar() 2: (77) r0 >>= 32 ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 3: (18) r1 = 0xffffb766c1684004 ; R1_w=map_value(map=mptcp_bp.bss,ks=4,vs=8,off=4) 5: (61) r1 = *(u32 *)(r1 +0) ; R1_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 6: (67) r1 <<= 32 ; R1_w=scalar(smax=0x7fffffff00000000,umax=0xffffffff00000000,smin32=0,sm ax32=umax32=0,var_off=(0x0; 0xffffffff00000000)) 7: (c7) r1 s>>= 32 ; R1_w=scalar(smin=0xffffffff80000000,smax=0x7fffffff) 8: (5d) if r0 != r1 goto pc+39 ; R0_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) R1_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) ; iter = 0; @ mptcp_bpf_iter.c:20 9: (18) r8 = 0xffffb766c1684000 ; R8_w=map_value(map=mptcp_bp.bss,ks=4,vs=8) 11: (b4) w1 = 0 ; R1_w=0 12: (63) *(u32 *)(r8 +0) = r1 ; R1_w=0 R8_w=map_value(map=mptcp_bp.bss,ks=4,vs=8) ; bpf_rcu_read_lock(); @ mptcp_bpf_iter.c:22 13: (85) call bpf_rcu_read_lock#84967 ; 14: (bf) r7 = r10 ; R7_w=fp0 R10=fp0 ; iter = 0; @ mptcp_bpf_iter.c:20 15: (07) r7 += -16 ; R7_w=fp-16 ; bpf_for_each(mptcp_subflow, subflow, msk) { @ mptcp_bpf_iter.c:23 16: (bf) r1 = r7 ; R1_w=fp-16 R7_w=fp-16 17: (bf) r2 = r6 ; R2_w=ptr_mptcp_sock() R6=ptr_mptcp_sock() 18: (85) call bpf_iter_mptcp_subflow_new#62694 R2 must be referenced or trusted processed 17 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1 -- END PROG LOAD LOG -- libbpf: prog 'trace_mptcp_sched_get_send': failed to load: -22 libbpf: failed to load object 'mptcp_bpf_iter' libbpf: failed to load BPF skeleton 'mptcp_bpf_iter': -22 
test_bpf_iter:FAIL:skel_open_load: mptcp_iter unexpected error: -22 #169/4 mptcp/bpf_iter:FAIL ''' I don't know how to fix it yet. > valid owned kernel object. Other BPF folks might help to clarify > this. > > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next) > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy) > > +BTF_ID_FLAGS(func, mptcp_subflow_active) But we do need to add KF_ITER_NEW/NEXT/DESTROY flags here: BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy, KF_ITER_DESTROY) With these flags, we can drop this additional test in loop: if (i++ >= MPTCP_SUBFLOWS_MAX) break; This problem has been bothering me for a while. I got "infinite loop detected" errors when walking the list with bpf_for_each(mptcp_subflow). So I had to use this additional test to break it. With these KF_ITER_NEW/NEXT/DESTROY flags, no need to use this additional test anymore. Thanks, -Geliang [1] https://patchwork.kernel.org/project/mptcp/patch/4467ab3af0f1a6c378778ca6be72753b16a1532c.1726132802.git.tanggeliang@kylinos.cn/ > > BTF_ID_FLAGS(func, mptcp_subflow_set_scheduled) > > BTF_ID_FLAGS(func, bpf_mptcp_subflow_ctx_by_pos) > > -BTF_ID_FLAGS(func, mptcp_subflow_active) > > BTF_ID_FLAGS(func, mptcp_set_timeout) > > BTF_ID_FLAGS(func, mptcp_wnd_end) > > BTF_ID_FLAGS(func, tcp_stream_memory_free) > > @@ -241,6 +286,8 @@ static int __init bpf_mptcp_kfunc_init(void) > > int ret; > > > > ret = register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set); > > + ret = ret ?: > > register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, > > + > > &bpf_mptcp_sched_kfunc_set); > > ret = ret ?: > > register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, > > > > &bpf_mptcp_sched_kfunc_set); > > #ifdef CONFIG_BPF_JIT > > -- > > 2.43.0 > > > >
On Thu, Sep 12, 2024 at 9:04 PM Geliang Tang <geliang@kernel.org> wrote: > > Hi Andrii, > > On Thu, 2024-09-12 at 11:24 -0700, Andrii Nakryiko wrote: > > On Thu, Sep 12, 2024 at 2:26 AM Geliang Tang <geliang@kernel.org> > > wrote: > > > > > > From: Geliang Tang <tanggeliang@kylinos.cn> > > > > > > It's necessary to traverse all subflows on the conn_list of an > > > MPTCP > > > socket and then call kfunc to modify the fields of each subflow. In > > > kernel space, mptcp_for_each_subflow() helper is used for this: > > > > > > mptcp_for_each_subflow(msk, subflow) > > > kfunc(subflow); > > > > > > But in the MPTCP BPF program, this has not yet been implemented. As > > > Martin suggested recently, this conn_list walking + modify-by-kfunc > > > usage fits the bpf_iter use case. So this patch adds a new bpf_iter > > > type named "mptcp_subflow" to do this and implements its helpers > > > bpf_iter_mptcp_subflow_new()/_next()/_destroy(). > > > > > > Since these bpf_iter mptcp_subflow helpers are invoked in its > > > selftest > > > in a ftrace hook for mptcp_sched_get_send(), it's necessary to > > > register > > > them into a BPF_PROG_TYPE_TRACING type kfunc set together with > > > other > > > two used kfuncs mptcp_subflow_active() and > > > mptcp_subflow_set_scheduled(). 
> > > > > > Then bpf_for_each() for mptcp_subflow can be used in BPF program > > > like > > > this: > > > > > > i = 0; > > > bpf_rcu_read_lock(); > > > bpf_for_each(mptcp_subflow, subflow, msk) { > > > if (i++ >= MPTCP_SUBFLOWS_MAX) > > > break; > > > kfunc(subflow); > > > } > > > bpf_rcu_read_unlock(); > > > > > > v2: remove msk->pm.lock in _new() and _destroy() (Martin) > > > drop DEFINE_BPF_ITER_FUNC, change opaque[3] to opaque[2] > > > (Andrii) > > > v3: drop bpf_iter__mptcp_subflow > > > v4: if msk is NULL, initialize kit->msk to NULL in _new() and check > > > it in > > > _next() (Andrii) > > > > > > Suggested-by: Martin KaFai Lau <martin.lau@kernel.org> > > > Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn> > > > --- > > > net/mptcp/bpf.c | 57 ++++++++++++++++++++++++++++++++++++++++++++- > > > ---- > > > 1 file changed, 52 insertions(+), 5 deletions(-) > > > > > > > Looks ok from setting up open-coded iterator things, but I can't > > speak > > for other aspects I mentioned below. 
> > > > > diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c > > > index 6414824402e6..fec18e7e5e4a 100644 > > > --- a/net/mptcp/bpf.c > > > +++ b/net/mptcp/bpf.c > > > @@ -201,9 +201,51 @@ static const struct btf_kfunc_id_set > > > bpf_mptcp_fmodret_set = { > > > .set = &bpf_mptcp_fmodret_ids, > > > }; > > > > > > -__diag_push(); > > > -__diag_ignore_all("-Wmissing-prototypes", > > > - "kfuncs which will be used in BPF programs"); > > > +struct bpf_iter_mptcp_subflow { > > > + __u64 __opaque[2]; > > > +} __attribute__((aligned(8))); > > > + > > > +struct bpf_iter_mptcp_subflow_kern { > > > + struct mptcp_sock *msk; > > > + struct list_head *pos; > > > +} __attribute__((aligned(8))); > > > + > > > +__bpf_kfunc_start_defs(); > > > + > > > +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct > > > bpf_iter_mptcp_subflow *it, > > > + struct mptcp_sock *msk) > > > +{ > > > + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; > > > + > > > + kit->msk = msk; > > > + if (!msk) > > > + return -EINVAL; > > > + > > > + kit->pos = &msk->conn_list; > > > + return 0; > > > +} > > > + > > > +__bpf_kfunc struct mptcp_subflow_context * > > > +bpf_iter_mptcp_subflow_next(struct bpf_iter_mptcp_subflow *it) > > > +{ > > > + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; > > > + struct mptcp_subflow_context *subflow; > > > + struct mptcp_sock *msk = kit->msk; > > > + > > > + if (!msk) > > > + return NULL; > > > + > > > + subflow = list_entry(kit->pos->next, struct > > > mptcp_subflow_context, node); > > > + if (!subflow || list_entry_is_head(subflow, &msk- > > > >conn_list, node)) > > > > it's a bit weird that you need both !subflow and > > list_entry_is_head(). > > Can you have NULL subflow at all? But this is the question to > > msk->conn_list semantics. > > Do you mean to return subflow regardless of whether subflow is NULL? If Can you have a NULL subflow in the middle of the list and some more items after that? If yes, then you should *skip* NULL subflow. 
But if the NULL subflow is always last, then do you even need list_entry_is_head() or list_is_last()? But ultimately, I have no idea what the data looks like, just double-check that the stopping condition is correct, that's all. Has nothing to do with BPF open-coded iterators. > so, we need to use list_is_last() instead of list_entry_is_head(): > > { > struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; > > if (!kit->msk || list_is_last(kit->pos, &kit->msk->conn_list)) > return NULL; > > kit->pos = kit->pos->next; > return list_entry(kit->pos, struct mptcp_subflow_context, node); > } > > Hope this is a better version. ok, works for me as well > > > > > > + return NULL; > > > + > > > + kit->pos = &subflow->node; > > > + return subflow; > > > +} > > > + > > > +__bpf_kfunc void bpf_iter_mptcp_subflow_destroy(struct > > > bpf_iter_mptcp_subflow *it) > > > +{ > > > +} > > > > > > __bpf_kfunc struct mptcp_subflow_context * > > > bpf_mptcp_subflow_ctx_by_pos(const struct mptcp_sched_data *data, > > > unsigned int pos) > > > @@ -218,12 +260,15 @@ __bpf_kfunc bool > > > bpf_mptcp_subflow_queues_empty(struct sock *sk) > > > return tcp_rtx_queue_empty(sk); > > > } > > > > > > -__diag_pop(); > > > +__bpf_kfunc_end_defs(); > > > > > > BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids) > > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new) > > > > I'm not 100% sure, but I suspect you might need to specify > > KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk` is a > > KF_TRUSTED_ARGS breaks the selftest in patch 2 [1]: Please, for the next revision, CC bpf@vger.kernel.org on *all* patches in your series, so we don't have to search for the selftests in another mailing list. 
> > ''' > libbpf: prog 'trace_mptcp_sched_get_send': BPF program load failed: > Invalid argument > libbpf: prog 'trace_mptcp_sched_get_send': -- BEGIN PROG LOAD LOG -- > 0: R1=ctx() R10=fp0 > ; int BPF_PROG(trace_mptcp_sched_get_send, struct mptcp_sock *msk) @ > mptcp_bpf_iter.c:13 > 0: (79) r6 = *(u64 *)(r1 +0) > func 'mptcp_sched_get_send' arg0 has btf_id 28019 type STRUCT > 'mptcp_sock' > 1: R1=ctx() R6_w=ptr_mptcp_sock() > ; if (bpf_get_current_pid_tgid() >> 32 != pid) @ mptcp_bpf_iter.c:17 > 1: (85) call bpf_get_current_pid_tgid#14 ; R0_w=scalar() > 2: (77) r0 >>= 32 ; > R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) > 3: (18) r1 = 0xffffb766c1684004 ; > R1_w=map_value(map=mptcp_bp.bss,ks=4,vs=8,off=4) > 5: (61) r1 = *(u32 *)(r1 +0) ; > R1_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) > 6: (67) r1 <<= 32 ; > R1_w=scalar(smax=0x7fffffff00000000,umax=0xffffffff00000000,smin32=0,sm > ax32=umax32=0,var_off=(0x0; 0xffffffff00000000)) > 7: (c7) r1 s>>= 32 ; > R1_w=scalar(smin=0xffffffff80000000,smax=0x7fffffff) > 8: (5d) if r0 != r1 goto pc+39 ; > R0_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; > 0x7fffffff)) > R1_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; > 0x7fffffff)) > ; iter = 0; @ mptcp_bpf_iter.c:20 > 9: (18) r8 = 0xffffb766c1684000 ; > R8_w=map_value(map=mptcp_bp.bss,ks=4,vs=8) > 11: (b4) w1 = 0 ; R1_w=0 > 12: (63) *(u32 *)(r8 +0) = r1 ; R1_w=0 > R8_w=map_value(map=mptcp_bp.bss,ks=4,vs=8) > ; bpf_rcu_read_lock(); @ mptcp_bpf_iter.c:22 > 13: (85) call bpf_rcu_read_lock#84967 ; > 14: (bf) r7 = r10 ; R7_w=fp0 R10=fp0 > ; iter = 0; @ mptcp_bpf_iter.c:20 > 15: (07) r7 += -16 ; R7_w=fp-16 > ; bpf_for_each(mptcp_subflow, subflow, msk) { @ mptcp_bpf_iter.c:23 > 16: (bf) r1 = r7 ; R1_w=fp-16 R7_w=fp-16 > 17: (bf) r2 = r6 ; R2_w=ptr_mptcp_sock() > R6=ptr_mptcp_sock() > 18: (85) call bpf_iter_mptcp_subflow_new#62694 > R2 must be referenced or trusted > processed 17 insns (limit 1000000) 
max_states_per_insn 0 total_states 1 > peak_states 1 mark_read 1 > -- END PROG LOAD LOG -- > libbpf: prog 'trace_mptcp_sched_get_send': failed to load: -22 > libbpf: failed to load object 'mptcp_bpf_iter' > libbpf: failed to load BPF skeleton 'mptcp_bpf_iter': -22 > test_bpf_iter:FAIL:skel_open_load: mptcp_iter unexpected error: -22 > #169/4 mptcp/bpf_iter:FAIL > ''' > > I don't know how to fix it yet. And the verifier is pointing out the real problem (selftest is at [0] for those interested). struct mptcp_sock *msk argument for fentry/mptcp_sched_get_send can't be trusted to be valid and be properly protected from being freed, etc. You need to have kfuncs that will have KF_ACQUIRE semantics and will take refcount or whatnot (or KF_RCU for RCU-protection). See some examples where we do the same with tasks or maybe sockets, I'm not very familiar with this area. [0] https://lore.kernel.org/mptcp/4467ab3af0f1a6c378778ca6be72753b16a1532c.1726132802.git.tanggeliang@kylinos.cn/ > > > valid owned kernel object. Other BPF folks might help to clarify > > this. > > > > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next) > > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy) > > > +BTF_ID_FLAGS(func, mptcp_subflow_active) > > But we do need to add KF_ITER_NEW/NEXT/DESTROY flags here: > > BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new, KF_ITER_NEW) > BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next, KF_ITER_NEXT | > KF_RET_NULL) > BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy, KF_ITER_DESTROY) > > With these flags, we can drop this additional test in loop: > > if (i++ >= MPTCP_SUBFLOWS_MAX) > break; > > > This problem has been bothering me for a while. I got "infinite loop > detected" errors when walking the list with > bpf_for_each(mptcp_subflow). So I had to use this additional test to > break it. > > With these KF_ITER_NEW/NEXT/DESTROY flags, no need to use this > additional test anymore. 
> These flags is what makes those functions an open-coded iterator implementation, yes. > Thanks, > -Geliang > > [1] > https://patchwork.kernel.org/project/mptcp/patch/4467ab3af0f1a6c378778ca6be72753b16a1532c.1726132802.git.tanggeliang@kylinos.cn/ > > > > BTF_ID_FLAGS(func, mptcp_subflow_set_scheduled) > > > BTF_ID_FLAGS(func, bpf_mptcp_subflow_ctx_by_pos) > > > -BTF_ID_FLAGS(func, mptcp_subflow_active) > > > BTF_ID_FLAGS(func, mptcp_set_timeout) > > > BTF_ID_FLAGS(func, mptcp_wnd_end) > > > BTF_ID_FLAGS(func, tcp_stream_memory_free) > > > @@ -241,6 +286,8 @@ static int __init bpf_mptcp_kfunc_init(void) > > > int ret; > > > > > > ret = register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set); > > > + ret = ret ?: > > > register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, > > > + > > > &bpf_mptcp_sched_kfunc_set); > > > ret = ret ?: > > > register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, > > > > > > &bpf_mptcp_sched_kfunc_set); > > > #ifdef CONFIG_BPF_JIT > > > -- > > > 2.43.0 > > > > > > >
On 9/13/24 1:57 PM, Andrii Nakryiko wrote: >>>> +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct >>>> bpf_iter_mptcp_subflow *it, >>>> + struct mptcp_sock *msk) >>>> +{ >>>> + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; >>>> + >>>> + kit->msk = msk; >>>> + if (!msk) >>>> + return -EINVAL; >>>> + >>>> + kit->pos = &msk->conn_list; >>>> + return 0; >>>> +} [ ... ] >>>> BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids) >>>> +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new) >>> >>> I'm not 100% sure, but I suspect you might need to specify >>> KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk` is a +1 >>>> @@ -241,6 +286,8 @@ static int __init bpf_mptcp_kfunc_init(void) >>>> int ret; >>>> >>>> ret = register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set); >>>> + ret = ret ?: >>>> register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, >>>> + >>>> &bpf_mptcp_sched_kfunc_set); This cannot be used in tracing. Going back to my earlier question in v1. How is the msk->conn_list protected?
Hi Martin, Andrii, Matt, On Fri, 2024-09-13 at 17:41 -0700, Martin KaFai Lau wrote: > On 9/13/24 1:57 PM, Andrii Nakryiko wrote: > > > > > +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct > > > > > bpf_iter_mptcp_subflow *it, > > > > > + struct mptcp_sock > > > > > *msk) > > > > > +{ > > > > > + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; > > > > > + > > > > > + kit->msk = msk; > > > > > + if (!msk) > > > > > + return -EINVAL; > > > > > + > > > > > + kit->pos = &msk->conn_list; > > > > > + return 0; > > > > > +} > > [ ... ] > > > > > > BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids) > > > > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new) > > > > > > > > I'm not 100% sure, but I suspect you might need to specify > > > > KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk` is > > > > a > > +1 So we must add KF_TRUSTED_ARGS flag, right? > > > > > > @@ -241,6 +286,8 @@ static int __init > > > > > bpf_mptcp_kfunc_init(void) > > > > > int ret; > > > > > > > > > > ret = > > > > > register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set); > > > > > + ret = ret ?: > > > > > register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, > > > > > + > > > > > &bpf_mptcp_sched_kfunc_set); > > This cannot be used in tracing. Actually, we don’t need to use mptcp_subflow bpf_iter in tracing. We plan to use it in MPTCP BPF packet schedulers, which are not tracing, but "struct_ops" types. 
And they work well with KF_TRUSTED_ARGS flag in bpf_iter_mptcp_subflow_new: BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new, KF_ITER_NEW | KF_TRUSTED_ARGS); An example of the scheduler is: SEC("struct_ops") int BPF_PROG(bpf_first_get_subflow, struct mptcp_sock *msk, struct mptcp_sched_data *data) { struct mptcp_subflow_context *subflow; bpf_rcu_read_lock(); bpf_for_each(mptcp_subflow, subflow, msk) { mptcp_subflow_set_scheduled(subflow, true); break; } bpf_rcu_read_unlock(); return 0; } SEC(".struct_ops") struct mptcp_sched_ops first = { .init = (void *)mptcp_sched_first_init, .release = (void *)mptcp_sched_first_release, .get_subflow = (void *)bpf_first_get_subflow, .name = "bpf_first", }; But BPF mptcp_sched_ops code has not been merged into bpf-next yet, so I simply test this bpf_for_each(mptcp_subflow) in tracing since I noticed other bpf_iter selftests are using tracing too: progs/iters_task.c SEC("fentry.s/" SYS_PREFIX "sys_getpgid") progs/iters_css.c SEC("fentry.s/" SYS_PREFIX "sys_getpgid") If this bpf_for_each(mptcp_subflow) can only be used in struct_ops, I will try to move the selftest into a struct_ops. > > Going back to my earlier question in v1. How is the msk->conn_list > protected? > msk->conn_list is protected by msk socket lock. (@Matt, am I right?) We use this in kernel code: struct sock *sk = (struct sock *)msk; lock_sock(sk); kfunc(&msk->conn_list); release_sock(sk); If so, should we also use lock_sock/release_sock in bpf_iter_mptcp_subflow_next()? Thanks, -Geliang
On Sat, 2024-09-14 at 16:40 +0800, Geliang Tang wrote: > Hi Martin, Andrii, Matt, > > On Fri, 2024-09-13 at 17:41 -0700, Martin KaFai Lau wrote: > > On 9/13/24 1:57 PM, Andrii Nakryiko wrote: > > > > > > +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct > > > > > > bpf_iter_mptcp_subflow *it, > > > > > > + struct > > > > > > mptcp_sock > > > > > > *msk) > > > > > > +{ > > > > > > + struct bpf_iter_mptcp_subflow_kern *kit = (void > > > > > > *)it; > > > > > > + > > > > > > + kit->msk = msk; > > > > > > + if (!msk) > > > > > > + return -EINVAL; > > > > > > + > > > > > > + kit->pos = &msk->conn_list; > > > > > > + return 0; > > > > > > +} > > > > [ ... ] > > > > > > > > BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids) > > > > > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new) > > > > > > > > > > I'm not 100% sure, but I suspect you might need to specify > > > > > KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk` > > > > > is > > > > > a > > > > +1 > > So we must add KF_TRUSTED_ARGS flag, right? > > > > > > > > > @@ -241,6 +286,8 @@ static int __init > > > > > > bpf_mptcp_kfunc_init(void) > > > > > > int ret; > > > > > > > > > > > > ret = > > > > > > register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set); > > > > > > + ret = ret ?: > > > > > > register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, > > > > > > + > > > > > > &bpf_mptcp_sched_kfunc_set); > > > > This cannot be used in tracing. > > Actually, we don’t need to use mptcp_subflow bpf_iter in tracing. > > We plan to use it in MPTCP BPF packet schedulers, which are not > tracing, but "struct_ops" types. 
And they work well with > KF_TRUSTED_ARGS flag in bpf_iter_mptcp_subflow_new: > > BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new, KF_ITER_NEW | > KF_TRUSTED_ARGS); > > An example of the scheduler is: > > SEC("struct_ops") > int BPF_PROG(bpf_first_get_subflow, struct mptcp_sock *msk, > struct mptcp_sched_data *data) > { > struct mptcp_subflow_context *subflow; > > bpf_rcu_read_lock(); > bpf_for_each(mptcp_subflow, subflow, msk) { > mptcp_subflow_set_scheduled(subflow, true); > break; > } > bpf_rcu_read_unlock(); > > return 0; > } > > SEC(".struct_ops") > struct mptcp_sched_ops first = { > .init = (void *)mptcp_sched_first_init, > .release = (void *)mptcp_sched_first_release, > .get_subflow = (void *)bpf_first_get_subflow, > .name = "bpf_first", > }; > > But BPF mptcp_sched_ops code has not been merged into bpf-next yet, > so > I simply test this bpf_for_each(mptcp_subflow) in tracing since I > noticed other bpf_iter selftests are using tracing too: > > progs/iters_task.c > SEC("fentry.s/" SYS_PREFIX "sys_getpgid") > > progs/iters_css.c > SEC("fentry.s/" SYS_PREFIX "sys_getpgid") > > If this bpf_for_each(mptcp_subflow) can only be used in struct_ops, I > will try to move the selftest into a struct_ops. > > > > > Going back to my earlier question in v1. How is the msk->conn_list > > protected? > > > > msk->conn_list is protected by msk socket lock. (@Matt, am I right?) > We > use this in kernel code: > > struct sock *sk = (struct sock *)msk; > > lock_sock(sk); > kfunc(&msk->conn_list); > release_sock(sk); > > If so, should we also use lock_sock/release_sock in > bpf_iter_mptcp_subflow_next()? bpf_for_each(mptcp_subflow) is expected to be used in the get_subflow() interface of mptcp_sched_ops to traverse all subflows and then pick one subflow to send data. 
This interface is invoked in mptcp_sched_get_send(), where the msk socket lock is held already: int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct mptcp_sched_data data; msk_owned_by_me(msk); ... ... mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) return 0; } data.reinject = false; if (msk->sched == &mptcp_sched_default || !msk->sched) return mptcp_sched_default_get_subflow(msk, &data); return msk->sched->get_subflow(msk, &data); } So no need to hold msk socket lock again in BPF schedulers. This means we can walk the conn_list without any lock. bpf_rcu_read_lock() and bpf_rcu_read_unlock() can be dropped in BPF schedulers too. Am I right? Thanks, -Geliang
diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c index 6414824402e6..fec18e7e5e4a 100644 --- a/net/mptcp/bpf.c +++ b/net/mptcp/bpf.c @@ -201,9 +201,51 @@ static const struct btf_kfunc_id_set bpf_mptcp_fmodret_set = { .set = &bpf_mptcp_fmodret_ids, }; -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "kfuncs which will be used in BPF programs"); +struct bpf_iter_mptcp_subflow { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + +struct bpf_iter_mptcp_subflow_kern { + struct mptcp_sock *msk; + struct list_head *pos; +} __attribute__((aligned(8))); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct bpf_iter_mptcp_subflow *it, + struct mptcp_sock *msk) +{ + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; + + kit->msk = msk; + if (!msk) + return -EINVAL; + + kit->pos = &msk->conn_list; + return 0; +} + +__bpf_kfunc struct mptcp_subflow_context * +bpf_iter_mptcp_subflow_next(struct bpf_iter_mptcp_subflow *it) +{ + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it; + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = kit->msk; + + if (!msk) + return NULL; + + subflow = list_entry(kit->pos->next, struct mptcp_subflow_context, node); + if (!subflow || list_entry_is_head(subflow, &msk->conn_list, node)) + return NULL; + + kit->pos = &subflow->node; + return subflow; +} + +__bpf_kfunc void bpf_iter_mptcp_subflow_destroy(struct bpf_iter_mptcp_subflow *it) +{ +} __bpf_kfunc struct mptcp_subflow_context * bpf_mptcp_subflow_ctx_by_pos(const struct mptcp_sched_data *data, unsigned int pos) @@ -218,12 +260,15 @@ __bpf_kfunc bool bpf_mptcp_subflow_queues_empty(struct sock *sk) return tcp_rtx_queue_empty(sk); } -__diag_pop(); +__bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids) +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new) +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next) +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy) +BTF_ID_FLAGS(func, mptcp_subflow_active) BTF_ID_FLAGS(func, 
mptcp_subflow_set_scheduled) BTF_ID_FLAGS(func, bpf_mptcp_subflow_ctx_by_pos) -BTF_ID_FLAGS(func, mptcp_subflow_active) BTF_ID_FLAGS(func, mptcp_set_timeout) BTF_ID_FLAGS(func, mptcp_wnd_end) BTF_ID_FLAGS(func, tcp_stream_memory_free) @@ -241,6 +286,8 @@ static int __init bpf_mptcp_kfunc_init(void) int ret; ret = register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &bpf_mptcp_sched_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_mptcp_sched_kfunc_set); #ifdef CONFIG_BPF_JIT