diff mbox series

[mptcp-next,v5,1/5] bpf: Add mptcp_subflow bpf_iter

Message ID 5e5b91efc6e06a90fb4d2440ddcbe9b55ee464be.1726132802.git.tanggeliang@kylinos.cn (mailing list archive)
State Awaiting Upstream
Delegated to: Netdev Maintainers
Headers show
Series [mptcp-next,v5,1/5] bpf: Add mptcp_subflow bpf_iter | expand

Commit Message

Geliang Tang Sept. 12, 2024, 9:25 a.m. UTC
From: Geliang Tang <tanggeliang@kylinos.cn>

It's necessary to traverse all subflows on the conn_list of an MPTCP
socket and then call kfunc to modify the fields of each subflow. In
kernel space, mptcp_for_each_subflow() helper is used for this:

	mptcp_for_each_subflow(msk, subflow)
		kfunc(subflow);

But in the MPTCP BPF program, this has not yet been implemented. As
Martin suggested recently, this conn_list walking + modify-by-kfunc
usage fits the bpf_iter use case. So this patch adds a new bpf_iter
type named "mptcp_subflow" to do this and implements its helpers
bpf_iter_mptcp_subflow_new()/_next()/_destroy().

Since these bpf_iter mptcp_subflow helpers are invoked in its selftest
in an ftrace hook for mptcp_sched_get_send(), it's necessary to register
them into a BPF_PROG_TYPE_TRACING type kfunc set together with other
two used kfuncs mptcp_subflow_active() and mptcp_subflow_set_scheduled().

Then bpf_for_each() for mptcp_subflow can be used in BPF program like
this:

	i = 0;
	bpf_rcu_read_lock();
	bpf_for_each(mptcp_subflow, subflow, msk) {
		if (i++ >= MPTCP_SUBFLOWS_MAX)
			break;
		kfunc(subflow);
	}
	bpf_rcu_read_unlock();

v2: remove msk->pm.lock in _new() and _destroy() (Martin)
    drop DEFINE_BPF_ITER_FUNC, change opaque[3] to opaque[2] (Andrii)
v3: drop bpf_iter__mptcp_subflow
v4: if msk is NULL, initialize kit->msk to NULL in _new() and check it in
    _next() (Andrii)

Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/mptcp/bpf.c | 57 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 5 deletions(-)

Comments

Andrii Nakryiko Sept. 12, 2024, 6:24 p.m. UTC | #1
On Thu, Sep 12, 2024 at 2:26 AM Geliang Tang <geliang@kernel.org> wrote:
>
> From: Geliang Tang <tanggeliang@kylinos.cn>
>
> It's necessary to traverse all subflows on the conn_list of an MPTCP
> socket and then call kfunc to modify the fields of each subflow. In
> kernel space, mptcp_for_each_subflow() helper is used for this:
>
>         mptcp_for_each_subflow(msk, subflow)
>                 kfunc(subflow);
>
> But in the MPTCP BPF program, this has not yet been implemented. As
> Martin suggested recently, this conn_list walking + modify-by-kfunc
> usage fits the bpf_iter use case. So this patch adds a new bpf_iter
> type named "mptcp_subflow" to do this and implements its helpers
> bpf_iter_mptcp_subflow_new()/_next()/_destroy().
>
> Since these bpf_iter mptcp_subflow helpers are invoked in its selftest
> in a ftrace hook for mptcp_sched_get_send(), it's necessary to register
> them into a BPF_PROG_TYPE_TRACING type kfunc set together with other
> two used kfuncs mptcp_subflow_active() and mptcp_subflow_set_scheduled().
>
> Then bpf_for_each() for mptcp_subflow can be used in BPF program like
> this:
>
>         i = 0;
>         bpf_rcu_read_lock();
>         bpf_for_each(mptcp_subflow, subflow, msk) {
>                 if (i++ >= MPTCP_SUBFLOWS_MAX)
>                         break;
>                 kfunc(subflow);
>         }
>         bpf_rcu_read_unlock();
>
> v2: remove msk->pm.lock in _new() and _destroy() (Martin)
>     drop DEFINE_BPF_ITER_FUNC, change opaque[3] to opaque[2] (Andrii)
> v3: drop bpf_iter__mptcp_subflow
> v4: if msk is NULL, initialize kit->msk to NULL in _new() and check it in
>     _next() (Andrii)
>
> Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
> Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
> ---
>  net/mptcp/bpf.c | 57 ++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 52 insertions(+), 5 deletions(-)
>

Looks ok from setting up open-coded iterator things, but I can't speak
for other aspects I mentioned below.

> diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
> index 6414824402e6..fec18e7e5e4a 100644
> --- a/net/mptcp/bpf.c
> +++ b/net/mptcp/bpf.c
> @@ -201,9 +201,51 @@ static const struct btf_kfunc_id_set bpf_mptcp_fmodret_set = {
>         .set   = &bpf_mptcp_fmodret_ids,
>  };
>
> -__diag_push();
> -__diag_ignore_all("-Wmissing-prototypes",
> -                 "kfuncs which will be used in BPF programs");
> +struct bpf_iter_mptcp_subflow {
> +       __u64 __opaque[2];
> +} __attribute__((aligned(8)));
> +
> +struct bpf_iter_mptcp_subflow_kern {
> +       struct mptcp_sock *msk;
> +       struct list_head *pos;
> +} __attribute__((aligned(8)));
> +
> +__bpf_kfunc_start_defs();
> +
> +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct bpf_iter_mptcp_subflow *it,
> +                                          struct mptcp_sock *msk)
> +{
> +       struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
> +
> +       kit->msk = msk;
> +       if (!msk)
> +               return -EINVAL;
> +
> +       kit->pos = &msk->conn_list;
> +       return 0;
> +}
> +
> +__bpf_kfunc struct mptcp_subflow_context *
> +bpf_iter_mptcp_subflow_next(struct bpf_iter_mptcp_subflow *it)
> +{
> +       struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
> +       struct mptcp_subflow_context *subflow;
> +       struct mptcp_sock *msk = kit->msk;
> +
> +       if (!msk)
> +               return NULL;
> +
> +       subflow = list_entry(kit->pos->next, struct mptcp_subflow_context, node);
> +       if (!subflow || list_entry_is_head(subflow, &msk->conn_list, node))

it's a bit weird that you need both !subflow and list_entry_is_head().
Can you have NULL subflow at all? But this is the question to
msk->conn_list semantics.

> +               return NULL;
> +
> +       kit->pos = &subflow->node;
> +       return subflow;
> +}
> +
> +__bpf_kfunc void bpf_iter_mptcp_subflow_destroy(struct bpf_iter_mptcp_subflow *it)
> +{
> +}
>
>  __bpf_kfunc struct mptcp_subflow_context *
>  bpf_mptcp_subflow_ctx_by_pos(const struct mptcp_sched_data *data, unsigned int pos)
> @@ -218,12 +260,15 @@ __bpf_kfunc bool bpf_mptcp_subflow_queues_empty(struct sock *sk)
>         return tcp_rtx_queue_empty(sk);
>  }
>
> -__diag_pop();
> +__bpf_kfunc_end_defs();
>
>  BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids)
> +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new)

I'm not 100% sure, but I suspect you might need to specify
KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk` is a
valid owned kernel object. Other BPF folks might help to clarify this.

> +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next)
> +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy)
> +BTF_ID_FLAGS(func, mptcp_subflow_active)
>  BTF_ID_FLAGS(func, mptcp_subflow_set_scheduled)
>  BTF_ID_FLAGS(func, bpf_mptcp_subflow_ctx_by_pos)
> -BTF_ID_FLAGS(func, mptcp_subflow_active)
>  BTF_ID_FLAGS(func, mptcp_set_timeout)
>  BTF_ID_FLAGS(func, mptcp_wnd_end)
>  BTF_ID_FLAGS(func, tcp_stream_memory_free)
> @@ -241,6 +286,8 @@ static int __init bpf_mptcp_kfunc_init(void)
>         int ret;
>
>         ret = register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set);
> +       ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
> +                                              &bpf_mptcp_sched_kfunc_set);
>         ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
>                                                &bpf_mptcp_sched_kfunc_set);
>  #ifdef CONFIG_BPF_JIT
> --
> 2.43.0
>
>
Geliang Tang Sept. 13, 2024, 4:04 a.m. UTC | #2
Hi Andrii,

On Thu, 2024-09-12 at 11:24 -0700, Andrii Nakryiko wrote:
> On Thu, Sep 12, 2024 at 2:26 AM Geliang Tang <geliang@kernel.org>
> wrote:
> > 
> > From: Geliang Tang <tanggeliang@kylinos.cn>
> > 
> > It's necessary to traverse all subflows on the conn_list of an
> > MPTCP
> > socket and then call kfunc to modify the fields of each subflow. In
> > kernel space, mptcp_for_each_subflow() helper is used for this:
> > 
> >         mptcp_for_each_subflow(msk, subflow)
> >                 kfunc(subflow);
> > 
> > But in the MPTCP BPF program, this has not yet been implemented. As
> > Martin suggested recently, this conn_list walking + modify-by-kfunc
> > usage fits the bpf_iter use case. So this patch adds a new bpf_iter
> > type named "mptcp_subflow" to do this and implements its helpers
> > bpf_iter_mptcp_subflow_new()/_next()/_destroy().
> > 
> > Since these bpf_iter mptcp_subflow helpers are invoked in its
> > selftest
> > in a ftrace hook for mptcp_sched_get_send(), it's necessary to
> > register
> > them into a BPF_PROG_TYPE_TRACING type kfunc set together with
> > other
> > two used kfuncs mptcp_subflow_active() and
> > mptcp_subflow_set_scheduled().
> > 
> > Then bpf_for_each() for mptcp_subflow can be used in BPF program
> > like
> > this:
> > 
> >         i = 0;
> >         bpf_rcu_read_lock();
> >         bpf_for_each(mptcp_subflow, subflow, msk) {
> >                 if (i++ >= MPTCP_SUBFLOWS_MAX)
> >                         break;
> >                 kfunc(subflow);
> >         }
> >         bpf_rcu_read_unlock();
> > 
> > v2: remove msk->pm.lock in _new() and _destroy() (Martin)
> >     drop DEFINE_BPF_ITER_FUNC, change opaque[3] to opaque[2]
> > (Andrii)
> > v3: drop bpf_iter__mptcp_subflow
> > v4: if msk is NULL, initialize kit->msk to NULL in _new() and check
> > it in
> >     _next() (Andrii)
> > 
> > Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
> > Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
> > ---
> >  net/mptcp/bpf.c | 57 ++++++++++++++++++++++++++++++++++++++++++++-
> > ----
> >  1 file changed, 52 insertions(+), 5 deletions(-)
> > 
> 
> Looks ok from setting up open-coded iterator things, but I can't
> speak
> for other aspects I mentioned below.
> 
> > diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
> > index 6414824402e6..fec18e7e5e4a 100644
> > --- a/net/mptcp/bpf.c
> > +++ b/net/mptcp/bpf.c
> > @@ -201,9 +201,51 @@ static const struct btf_kfunc_id_set
> > bpf_mptcp_fmodret_set = {
> >         .set   = &bpf_mptcp_fmodret_ids,
> >  };
> > 
> > -__diag_push();
> > -__diag_ignore_all("-Wmissing-prototypes",
> > -                 "kfuncs which will be used in BPF programs");
> > +struct bpf_iter_mptcp_subflow {
> > +       __u64 __opaque[2];
> > +} __attribute__((aligned(8)));
> > +
> > +struct bpf_iter_mptcp_subflow_kern {
> > +       struct mptcp_sock *msk;
> > +       struct list_head *pos;
> > +} __attribute__((aligned(8)));
> > +
> > +__bpf_kfunc_start_defs();
> > +
> > +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct
> > bpf_iter_mptcp_subflow *it,
> > +                                          struct mptcp_sock *msk)
> > +{
> > +       struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
> > +
> > +       kit->msk = msk;
> > +       if (!msk)
> > +               return -EINVAL;
> > +
> > +       kit->pos = &msk->conn_list;
> > +       return 0;
> > +}
> > +
> > +__bpf_kfunc struct mptcp_subflow_context *
> > +bpf_iter_mptcp_subflow_next(struct bpf_iter_mptcp_subflow *it)
> > +{
> > +       struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
> > +       struct mptcp_subflow_context *subflow;
> > +       struct mptcp_sock *msk = kit->msk;
> > +
> > +       if (!msk)
> > +               return NULL;
> > +
> > +       subflow = list_entry(kit->pos->next, struct
> > mptcp_subflow_context, node);
> > +       if (!subflow || list_entry_is_head(subflow, &msk-
> > >conn_list, node))
> 
> it's a bit weird that you need both !subflow and
> list_entry_is_head().
> Can you have NULL subflow at all? But this is the question to
> msk->conn_list semantics.

Do you mean to return subflow regardless of whether subflow is NULL? If
so, we need to use list_is_last() instead of list_entry_is_head():

{
    struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;

    if (!kit->msk || list_is_last(kit->pos, &kit->msk->conn_list))
            return NULL;

    kit->pos = kit->pos->next;
    return list_entry(kit->pos, struct mptcp_subflow_context, node);
}

Hope this is a better version.

> 
> > +               return NULL;
> > +
> > +       kit->pos = &subflow->node;
> > +       return subflow;
> > +}
> > +
> > +__bpf_kfunc void bpf_iter_mptcp_subflow_destroy(struct
> > bpf_iter_mptcp_subflow *it)
> > +{
> > +}
> > 
> >  __bpf_kfunc struct mptcp_subflow_context *
> >  bpf_mptcp_subflow_ctx_by_pos(const struct mptcp_sched_data *data,
> > unsigned int pos)
> > @@ -218,12 +260,15 @@ __bpf_kfunc bool
> > bpf_mptcp_subflow_queues_empty(struct sock *sk)
> >         return tcp_rtx_queue_empty(sk);
> >  }
> > 
> > -__diag_pop();
> > +__bpf_kfunc_end_defs();
> > 
> >  BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids)
> > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new)
> 
> I'm not 100% sure, but I suspect you might need to specify
> KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk` is a

KF_TRUSTED_ARGS breaks the selftest in patch 2 [1]:

'''
libbpf: prog 'trace_mptcp_sched_get_send': BPF program load failed:
Invalid argument
libbpf: prog 'trace_mptcp_sched_get_send': -- BEGIN PROG LOAD LOG --
0: R1=ctx() R10=fp0
; int BPF_PROG(trace_mptcp_sched_get_send, struct mptcp_sock *msk) @
mptcp_bpf_iter.c:13
0: (79) r6 = *(u64 *)(r1 +0)
func 'mptcp_sched_get_send' arg0 has btf_id 28019 type STRUCT
'mptcp_sock'
1: R1=ctx() R6_w=ptr_mptcp_sock()
; if (bpf_get_current_pid_tgid() >> 32 != pid) @ mptcp_bpf_iter.c:17
1: (85) call bpf_get_current_pid_tgid#14      ; R0_w=scalar()
2: (77) r0 >>= 32                     ;
R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
3: (18) r1 = 0xffffb766c1684004       ;
R1_w=map_value(map=mptcp_bp.bss,ks=4,vs=8,off=4)
5: (61) r1 = *(u32 *)(r1 +0)          ;
R1_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
6: (67) r1 <<= 32                     ;
R1_w=scalar(smax=0x7fffffff00000000,umax=0xffffffff00000000,smin32=0,sm
ax32=umax32=0,var_off=(0x0; 0xffffffff00000000))
7: (c7) r1 s>>= 32                    ;
R1_w=scalar(smin=0xffffffff80000000,smax=0x7fffffff)
8: (5d) if r0 != r1 goto pc+39        ;
R0_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0;
0x7fffffff))
R1_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0;
0x7fffffff))
; iter = 0; @ mptcp_bpf_iter.c:20
9: (18) r8 = 0xffffb766c1684000       ;
R8_w=map_value(map=mptcp_bp.bss,ks=4,vs=8)
11: (b4) w1 = 0                       ; R1_w=0
12: (63) *(u32 *)(r8 +0) = r1         ; R1_w=0
R8_w=map_value(map=mptcp_bp.bss,ks=4,vs=8)
; bpf_rcu_read_lock(); @ mptcp_bpf_iter.c:22
13: (85) call bpf_rcu_read_lock#84967         ;
14: (bf) r7 = r10                     ; R7_w=fp0 R10=fp0
; iter = 0; @ mptcp_bpf_iter.c:20
15: (07) r7 += -16                    ; R7_w=fp-16
; bpf_for_each(mptcp_subflow, subflow, msk) { @ mptcp_bpf_iter.c:23
16: (bf) r1 = r7                      ; R1_w=fp-16 R7_w=fp-16
17: (bf) r2 = r6                      ; R2_w=ptr_mptcp_sock()
R6=ptr_mptcp_sock()
18: (85) call bpf_iter_mptcp_subflow_new#62694
R2 must be referenced or trusted
processed 17 insns (limit 1000000) max_states_per_insn 0 total_states 1
peak_states 1 mark_read 1
-- END PROG LOAD LOG --
libbpf: prog 'trace_mptcp_sched_get_send': failed to load: -22
libbpf: failed to load object 'mptcp_bpf_iter'
libbpf: failed to load BPF skeleton 'mptcp_bpf_iter': -22
test_bpf_iter:FAIL:skel_open_load: mptcp_iter unexpected error: -22
#169/4   mptcp/bpf_iter:FAIL
'''

I don't know how to fix it yet.

> valid owned kernel object. Other BPF folks might help to clarify
> this.
> 
> > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next)
> > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy)
> > +BTF_ID_FLAGS(func, mptcp_subflow_active)

But we do need to add KF_ITER_NEW/NEXT/DESTROY flags here:

BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next, KF_ITER_NEXT |
KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy, KF_ITER_DESTROY)

With these flags, we can drop this additional test in loop:

	if (i++ >= MPTCP_SUBFLOWS_MAX)
		break;


This problem has been bothering me for a while. I got "infinite loop
detected" errors when walking the list with
bpf_for_each(mptcp_subflow). So I had to use this additional test to
break it.

With these KF_ITER_NEW/NEXT/DESTROY flags, no need to use this
additional test anymore.

Thanks,
-Geliang

[1]
https://patchwork.kernel.org/project/mptcp/patch/4467ab3af0f1a6c378778ca6be72753b16a1532c.1726132802.git.tanggeliang@kylinos.cn/

> >  BTF_ID_FLAGS(func, mptcp_subflow_set_scheduled)
> >  BTF_ID_FLAGS(func, bpf_mptcp_subflow_ctx_by_pos)
> > -BTF_ID_FLAGS(func, mptcp_subflow_active)
> >  BTF_ID_FLAGS(func, mptcp_set_timeout)
> >  BTF_ID_FLAGS(func, mptcp_wnd_end)
> >  BTF_ID_FLAGS(func, tcp_stream_memory_free)
> > @@ -241,6 +286,8 @@ static int __init bpf_mptcp_kfunc_init(void)
> >         int ret;
> > 
> >         ret = register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set);
> > +       ret = ret ?:
> > register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
> > +                                             
> > &bpf_mptcp_sched_kfunc_set);
> >         ret = ret ?:
> > register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
> >                                               
> > &bpf_mptcp_sched_kfunc_set);
> >  #ifdef CONFIG_BPF_JIT
> > --
> > 2.43.0
> > 
> >
Andrii Nakryiko Sept. 13, 2024, 8:57 p.m. UTC | #3
On Thu, Sep 12, 2024 at 9:04 PM Geliang Tang <geliang@kernel.org> wrote:
>
> Hi Andrii,
>
> On Thu, 2024-09-12 at 11:24 -0700, Andrii Nakryiko wrote:
> > On Thu, Sep 12, 2024 at 2:26 AM Geliang Tang <geliang@kernel.org>
> > wrote:
> > >
> > > From: Geliang Tang <tanggeliang@kylinos.cn>
> > >
> > > It's necessary to traverse all subflows on the conn_list of an
> > > MPTCP
> > > socket and then call kfunc to modify the fields of each subflow. In
> > > kernel space, mptcp_for_each_subflow() helper is used for this:
> > >
> > >         mptcp_for_each_subflow(msk, subflow)
> > >                 kfunc(subflow);
> > >
> > > But in the MPTCP BPF program, this has not yet been implemented. As
> > > Martin suggested recently, this conn_list walking + modify-by-kfunc
> > > usage fits the bpf_iter use case. So this patch adds a new bpf_iter
> > > type named "mptcp_subflow" to do this and implements its helpers
> > > bpf_iter_mptcp_subflow_new()/_next()/_destroy().
> > >
> > > Since these bpf_iter mptcp_subflow helpers are invoked in its
> > > selftest
> > > in a ftrace hook for mptcp_sched_get_send(), it's necessary to
> > > register
> > > them into a BPF_PROG_TYPE_TRACING type kfunc set together with
> > > other
> > > two used kfuncs mptcp_subflow_active() and
> > > mptcp_subflow_set_scheduled().
> > >
> > > Then bpf_for_each() for mptcp_subflow can be used in BPF program
> > > like
> > > this:
> > >
> > >         i = 0;
> > >         bpf_rcu_read_lock();
> > >         bpf_for_each(mptcp_subflow, subflow, msk) {
> > >                 if (i++ >= MPTCP_SUBFLOWS_MAX)
> > >                         break;
> > >                 kfunc(subflow);
> > >         }
> > >         bpf_rcu_read_unlock();
> > >
> > > v2: remove msk->pm.lock in _new() and _destroy() (Martin)
> > >     drop DEFINE_BPF_ITER_FUNC, change opaque[3] to opaque[2]
> > > (Andrii)
> > > v3: drop bpf_iter__mptcp_subflow
> > > v4: if msk is NULL, initialize kit->msk to NULL in _new() and check
> > > it in
> > >     _next() (Andrii)
> > >
> > > Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
> > > Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
> > > ---
> > >  net/mptcp/bpf.c | 57 ++++++++++++++++++++++++++++++++++++++++++++-
> > > ----
> > >  1 file changed, 52 insertions(+), 5 deletions(-)
> > >
> >
> > Looks ok from setting up open-coded iterator things, but I can't
> > speak
> > for other aspects I mentioned below.
> >
> > > diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
> > > index 6414824402e6..fec18e7e5e4a 100644
> > > --- a/net/mptcp/bpf.c
> > > +++ b/net/mptcp/bpf.c
> > > @@ -201,9 +201,51 @@ static const struct btf_kfunc_id_set
> > > bpf_mptcp_fmodret_set = {
> > >         .set   = &bpf_mptcp_fmodret_ids,
> > >  };
> > >
> > > -__diag_push();
> > > -__diag_ignore_all("-Wmissing-prototypes",
> > > -                 "kfuncs which will be used in BPF programs");
> > > +struct bpf_iter_mptcp_subflow {
> > > +       __u64 __opaque[2];
> > > +} __attribute__((aligned(8)));
> > > +
> > > +struct bpf_iter_mptcp_subflow_kern {
> > > +       struct mptcp_sock *msk;
> > > +       struct list_head *pos;
> > > +} __attribute__((aligned(8)));
> > > +
> > > +__bpf_kfunc_start_defs();
> > > +
> > > +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct
> > > bpf_iter_mptcp_subflow *it,
> > > +                                          struct mptcp_sock *msk)
> > > +{
> > > +       struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
> > > +
> > > +       kit->msk = msk;
> > > +       if (!msk)
> > > +               return -EINVAL;
> > > +
> > > +       kit->pos = &msk->conn_list;
> > > +       return 0;
> > > +}
> > > +
> > > +__bpf_kfunc struct mptcp_subflow_context *
> > > +bpf_iter_mptcp_subflow_next(struct bpf_iter_mptcp_subflow *it)
> > > +{
> > > +       struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
> > > +       struct mptcp_subflow_context *subflow;
> > > +       struct mptcp_sock *msk = kit->msk;
> > > +
> > > +       if (!msk)
> > > +               return NULL;
> > > +
> > > +       subflow = list_entry(kit->pos->next, struct
> > > mptcp_subflow_context, node);
> > > +       if (!subflow || list_entry_is_head(subflow, &msk-
> > > >conn_list, node))
> >
> > it's a bit weird that you need both !subflow and
> > list_entry_is_head().
> > Can you have NULL subflow at all? But this is the question to
> > msk->conn_list semantics.
>
> Do you mean to return subflow regardless of whether subflow is NULL? If

Can you have a NULL subflow in the middle of the list and some more
items after that? If yes, then you should *skip* NULL subflow.

But if the NULL subflow is always last, then do you even need
list_entry_is_head() or list_is_last()?

But ultimately, I have no idea what the data looks like, just
double-check that the stopping condition is correct, that's all. Has
nothing to do with BPF open-coded iterators.

> so, we need to use list_is_last() instead of list_entry_is_head():
>
> {
>     struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
>
>     if (!kit->msk || list_is_last(kit->pos, &kit->msk->conn_list))
>             return NULL;
>
>     kit->pos = kit->pos->next;
>     return list_entry(kit->pos, struct mptcp_subflow_context, node);
> }
>
> Hope this is a better version.

ok, works for me as well

>
> >
> > > +               return NULL;
> > > +
> > > +       kit->pos = &subflow->node;
> > > +       return subflow;
> > > +}
> > > +
> > > +__bpf_kfunc void bpf_iter_mptcp_subflow_destroy(struct
> > > bpf_iter_mptcp_subflow *it)
> > > +{
> > > +}
> > >
> > >  __bpf_kfunc struct mptcp_subflow_context *
> > >  bpf_mptcp_subflow_ctx_by_pos(const struct mptcp_sched_data *data,
> > > unsigned int pos)
> > > @@ -218,12 +260,15 @@ __bpf_kfunc bool
> > > bpf_mptcp_subflow_queues_empty(struct sock *sk)
> > >         return tcp_rtx_queue_empty(sk);
> > >  }
> > >
> > > -__diag_pop();
> > > +__bpf_kfunc_end_defs();
> > >
> > >  BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids)
> > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new)
> >
> > I'm not 100% sure, but I suspect you might need to specify
> > KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk` is a
>
> KF_TRUSTED_ARGS breaks the selftest in patch 2 [1]:

Please, for the next revision, CC bpf@vger.kernel.org on *all* patches
in your series, so we don't have to search for the selftests in
another mailing list.

>
> '''
> libbpf: prog 'trace_mptcp_sched_get_send': BPF program load failed:
> Invalid argument
> libbpf: prog 'trace_mptcp_sched_get_send': -- BEGIN PROG LOAD LOG --
> 0: R1=ctx() R10=fp0
> ; int BPF_PROG(trace_mptcp_sched_get_send, struct mptcp_sock *msk) @
> mptcp_bpf_iter.c:13
> 0: (79) r6 = *(u64 *)(r1 +0)
> func 'mptcp_sched_get_send' arg0 has btf_id 28019 type STRUCT
> 'mptcp_sock'
> 1: R1=ctx() R6_w=ptr_mptcp_sock()
> ; if (bpf_get_current_pid_tgid() >> 32 != pid) @ mptcp_bpf_iter.c:17
> 1: (85) call bpf_get_current_pid_tgid#14      ; R0_w=scalar()
> 2: (77) r0 >>= 32                     ;
> R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
> 3: (18) r1 = 0xffffb766c1684004       ;
> R1_w=map_value(map=mptcp_bp.bss,ks=4,vs=8,off=4)
> 5: (61) r1 = *(u32 *)(r1 +0)          ;
> R1_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
> 6: (67) r1 <<= 32                     ;
> R1_w=scalar(smax=0x7fffffff00000000,umax=0xffffffff00000000,smin32=0,sm
> ax32=umax32=0,var_off=(0x0; 0xffffffff00000000))
> 7: (c7) r1 s>>= 32                    ;
> R1_w=scalar(smin=0xffffffff80000000,smax=0x7fffffff)
> 8: (5d) if r0 != r1 goto pc+39        ;
> R0_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0;
> 0x7fffffff))
> R1_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0;
> 0x7fffffff))
> ; iter = 0; @ mptcp_bpf_iter.c:20
> 9: (18) r8 = 0xffffb766c1684000       ;
> R8_w=map_value(map=mptcp_bp.bss,ks=4,vs=8)
> 11: (b4) w1 = 0                       ; R1_w=0
> 12: (63) *(u32 *)(r8 +0) = r1         ; R1_w=0
> R8_w=map_value(map=mptcp_bp.bss,ks=4,vs=8)
> ; bpf_rcu_read_lock(); @ mptcp_bpf_iter.c:22
> 13: (85) call bpf_rcu_read_lock#84967         ;
> 14: (bf) r7 = r10                     ; R7_w=fp0 R10=fp0
> ; iter = 0; @ mptcp_bpf_iter.c:20
> 15: (07) r7 += -16                    ; R7_w=fp-16
> ; bpf_for_each(mptcp_subflow, subflow, msk) { @ mptcp_bpf_iter.c:23
> 16: (bf) r1 = r7                      ; R1_w=fp-16 R7_w=fp-16
> 17: (bf) r2 = r6                      ; R2_w=ptr_mptcp_sock()
> R6=ptr_mptcp_sock()
> 18: (85) call bpf_iter_mptcp_subflow_new#62694
> R2 must be referenced or trusted
> processed 17 insns (limit 1000000) max_states_per_insn 0 total_states 1
> peak_states 1 mark_read 1
> -- END PROG LOAD LOG --
> libbpf: prog 'trace_mptcp_sched_get_send': failed to load: -22
> libbpf: failed to load object 'mptcp_bpf_iter'
> libbpf: failed to load BPF skeleton 'mptcp_bpf_iter': -22
> test_bpf_iter:FAIL:skel_open_load: mptcp_iter unexpected error: -22
> #169/4   mptcp/bpf_iter:FAIL
> '''
>
> I don't know how to fix it yet.

And the verifier is pointing out the real problem (selftest is at [0]
for those interested).

struct mptcp_sock *msk argument for fentry/mptcp_sched_get_send can't
be trusted to be valid and be properly protected from being freed,
etc. You need to have kfuncs that will have KF_ACQUIRE semantics and
will take refcount or whatnot (or KF_RCU for RCU-protection).

See some examples where we do the same with tasks or maybe sockets,
I'm not very familiar with this area.

  [0] https://lore.kernel.org/mptcp/4467ab3af0f1a6c378778ca6be72753b16a1532c.1726132802.git.tanggeliang@kylinos.cn/

>
> > valid owned kernel object. Other BPF folks might help to clarify
> > this.
> >
> > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next)
> > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy)
> > > +BTF_ID_FLAGS(func, mptcp_subflow_active)
>
> But we do need to add KF_ITER_NEW/NEXT/DESTROY flags here:
>
> BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new, KF_ITER_NEW)
> BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next, KF_ITER_NEXT |
> KF_RET_NULL)
> BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy, KF_ITER_DESTROY)
>
> With these flags, we can drop this additional test in loop:
>
>         if (i++ >= MPTCP_SUBFLOWS_MAX)
>                 break;
>
>
> This problem has been bothering me for a while. I got "infinite loop
> detected" errors when walking the list with
> bpf_for_each(mptcp_subflow). So I had to use this additional test to
> break it.
>
> With these KF_ITER_NEW/NEXT/DESTROY flags, no need to use this
> additional test anymore.
>

These flags are what make those functions an open-coded iterator
implementation, yes.

> Thanks,
> -Geliang
>
> [1]
> https://patchwork.kernel.org/project/mptcp/patch/4467ab3af0f1a6c378778ca6be72753b16a1532c.1726132802.git.tanggeliang@kylinos.cn/
>
> > >  BTF_ID_FLAGS(func, mptcp_subflow_set_scheduled)
> > >  BTF_ID_FLAGS(func, bpf_mptcp_subflow_ctx_by_pos)
> > > -BTF_ID_FLAGS(func, mptcp_subflow_active)
> > >  BTF_ID_FLAGS(func, mptcp_set_timeout)
> > >  BTF_ID_FLAGS(func, mptcp_wnd_end)
> > >  BTF_ID_FLAGS(func, tcp_stream_memory_free)
> > > @@ -241,6 +286,8 @@ static int __init bpf_mptcp_kfunc_init(void)
> > >         int ret;
> > >
> > >         ret = register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set);
> > > +       ret = ret ?:
> > > register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
> > > +
> > > &bpf_mptcp_sched_kfunc_set);
> > >         ret = ret ?:
> > > register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
> > >
> > > &bpf_mptcp_sched_kfunc_set);
> > >  #ifdef CONFIG_BPF_JIT
> > > --
> > > 2.43.0
> > >
> > >
>
Martin KaFai Lau Sept. 14, 2024, 12:41 a.m. UTC | #4
On 9/13/24 1:57 PM, Andrii Nakryiko wrote:
>>>> +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct
>>>> bpf_iter_mptcp_subflow *it,
>>>> +                                          struct mptcp_sock *msk)
>>>> +{
>>>> +       struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
>>>> +
>>>> +       kit->msk = msk;
>>>> +       if (!msk)
>>>> +               return -EINVAL;
>>>> +
>>>> +       kit->pos = &msk->conn_list;
>>>> +       return 0;
>>>> +}

[ ... ]

>>>>   BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids)
>>>> +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new)
>>>
>>> I'm not 100% sure, but I suspect you might need to specify
>>> KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk` is a

+1

>>>> @@ -241,6 +286,8 @@ static int __init bpf_mptcp_kfunc_init(void)
>>>>          int ret;
>>>>
>>>>          ret = register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set);
>>>> +       ret = ret ?:
>>>> register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
>>>> +
>>>> &bpf_mptcp_sched_kfunc_set);

This cannot be used in tracing.

Going back to my earlier question in v1. How is the msk->conn_list protected?
Geliang Tang Sept. 14, 2024, 8:40 a.m. UTC | #5
Hi Martin, Andrii, Matt,

On Fri, 2024-09-13 at 17:41 -0700, Martin KaFai Lau wrote:
> On 9/13/24 1:57 PM, Andrii Nakryiko wrote:
> > > > > +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct
> > > > > bpf_iter_mptcp_subflow *it,
> > > > > +                                          struct mptcp_sock
> > > > > *msk)
> > > > > +{
> > > > > +       struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
> > > > > +
> > > > > +       kit->msk = msk;
> > > > > +       if (!msk)
> > > > > +               return -EINVAL;
> > > > > +
> > > > > +       kit->pos = &msk->conn_list;
> > > > > +       return 0;
> > > > > +}
> 
> [ ... ]
> 
> > > > >   BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids)
> > > > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new)
> > > > 
> > > > I'm not 100% sure, but I suspect you might need to specify
> > > > KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk` is
> > > > a
> 
> +1

So we must add KF_TRUSTED_ARGS flag, right?

> 
> > > > > @@ -241,6 +286,8 @@ static int __init
> > > > > bpf_mptcp_kfunc_init(void)
> > > > >          int ret;
> > > > > 
> > > > >          ret =
> > > > > register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set);
> > > > > +       ret = ret ?:
> > > > > register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
> > > > > +
> > > > > &bpf_mptcp_sched_kfunc_set);
> 
> This cannot be used in tracing.

Actually, we don’t need to use mptcp_subflow bpf_iter in tracing.

We plan to use it in MPTCP BPF packet schedulers, which are not
tracing, but "struct_ops" types. And they work well with
KF_TRUSTED_ARGS flag in bpf_iter_mptcp_subflow_new:

BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new, KF_ITER_NEW |
KF_TRUSTED_ARGS);

An example of the scheduler is:

SEC("struct_ops")
int BPF_PROG(bpf_first_get_subflow, struct mptcp_sock *msk,
             struct mptcp_sched_data *data)
{
        struct mptcp_subflow_context *subflow;

        bpf_rcu_read_lock();
        bpf_for_each(mptcp_subflow, subflow, msk) {
                mptcp_subflow_set_scheduled(subflow, true);
                break;
        }
        bpf_rcu_read_unlock();

        return 0;
}

SEC(".struct_ops")
struct mptcp_sched_ops first = { 
        .init           = (void *)mptcp_sched_first_init,
        .release        = (void *)mptcp_sched_first_release,
        .get_subflow    = (void *)bpf_first_get_subflow,
        .name           = "bpf_first",
};

But BPF mptcp_sched_ops code has not been merged into bpf-next yet, so
I simply test this bpf_for_each(mptcp_subflow) in tracing since I
noticed other bpf_iter selftests are using tracing too:

progs/iters_task.c
SEC("fentry.s/" SYS_PREFIX "sys_getpgid")

progs/iters_css.c
SEC("fentry.s/" SYS_PREFIX "sys_getpgid")

If this bpf_for_each(mptcp_subflow) can only be used in struct_ops, I
will try to move the selftest into a struct_ops.

> 
> Going back to my earlier question in v1. How is the msk->conn_list
> protected?
> 

msk->conn_list is protected by msk socket lock. (@Matt, am I right?) We
use this in kernel code:

	struct sock *sk = (struct sock *)msk;

	lock_sock(sk);
	kfunc(&msk->conn_list);
	release_sock(sk);

If so, should we also use lock_sock/release_sock in
bpf_iter_mptcp_subflow_next()?

Thanks,
-Geliang
Geliang Tang Sept. 14, 2024, 10:12 a.m. UTC | #6
On Sat, 2024-09-14 at 16:40 +0800, Geliang Tang wrote:
> Hi Martin, Andrii, Matt,
> 
> On Fri, 2024-09-13 at 17:41 -0700, Martin KaFai Lau wrote:
> > On 9/13/24 1:57 PM, Andrii Nakryiko wrote:
> > > > > > +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct
> > > > > > bpf_iter_mptcp_subflow *it,
> > > > > > +                                          struct
> > > > > > mptcp_sock
> > > > > > *msk)
> > > > > > +{
> > > > > > +       struct bpf_iter_mptcp_subflow_kern *kit = (void
> > > > > > *)it;
> > > > > > +
> > > > > > +       kit->msk = msk;
> > > > > > +       if (!msk)
> > > > > > +               return -EINVAL;
> > > > > > +
> > > > > > +       kit->pos = &msk->conn_list;
> > > > > > +       return 0;
> > > > > > +}
> > 
> > [ ... ]
> > 
> > > > > >   BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids)
> > > > > > +BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new)
> > > > > 
> > > > > I'm not 100% sure, but I suspect you might need to specify
> > > > > KF_TRUSTED_ARGS here to ensure that `struct mptcp_sock *msk`
> > > > > is
> > > > > a
> > 
> > +1
> 
> So we must add KF_TRUSTED_ARGS flag, right?
> 
> > 
> > > > > > @@ -241,6 +286,8 @@ static int __init
> > > > > > bpf_mptcp_kfunc_init(void)
> > > > > >          int ret;
> > > > > > 
> > > > > >          ret =
> > > > > > register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set);
> > > > > > +       ret = ret ?:
> > > > > > register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
> > > > > > +
> > > > > > &bpf_mptcp_sched_kfunc_set);
> > 
> > This cannot be used in tracing.
> 
> Actually, we don’t need to use mptcp_subflow bpf_iter in tracing.
> 
> We plan to use it in MPTCP BPF packet schedulers, which are not
> tracing, but "struct_ops" types. And they work well with
> KF_TRUSTED_ARGS flag in bpf_iter_mptcp_subflow_new:
> 
> BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new, KF_ITER_NEW |
> KF_TRUSTED_ARGS);
> 
> An example of the scheduler is:
> 
> SEC("struct_ops")
> int BPF_PROG(bpf_first_get_subflow, struct mptcp_sock *msk,
>              struct mptcp_sched_data *data)
> {
>         struct mptcp_subflow_context *subflow;
> 
>         bpf_rcu_read_lock();
>         bpf_for_each(mptcp_subflow, subflow, msk) {
>                 mptcp_subflow_set_scheduled(subflow, true);
>                 break;
>         }
>         bpf_rcu_read_unlock();
> 
>         return 0;
> }
> 
> SEC(".struct_ops")
> struct mptcp_sched_ops first = { 
>         .init           = (void *)mptcp_sched_first_init,
>         .release        = (void *)mptcp_sched_first_release,
>         .get_subflow    = (void *)bpf_first_get_subflow,
>         .name           = "bpf_first",
> };
> 
> But BPF mptcp_sched_ops code has not been merged into bpf-next yet,
> so
> I simply test this bpf_for_each(mptcp_subflow) in tracing since I
> noticed other bpf_iter selftests are using tracing too:
> 
> progs/iters_task.c
> SEC("fentry.s/" SYS_PREFIX "sys_getpgid")
> 
> progs/iters_css.c
> SEC("fentry.s/" SYS_PREFIX "sys_getpgid")
> 
> If this bpf_for_each(mptcp_subflow) can only be used in struct_ops, I
> will try to move the selftest into a struct_ops.
> 
> > 
> > Going back to my earlier question in v1. How is the msk->conn_list
> > protected?
> > 
> 
> msk->conn_list is protected by msk socket lock. (@Matt, am I right?)
> We
> use this in kernel code:
> 
> 	struct sock *sk = (struct sock *)msk;
> 
> 	lock_sock(sk);
> 	kfunc(&msk->conn_list);
> 	release_sock(sk);
> 
> If so, should we also use lock_sock/release_sock in
> bpf_iter_mptcp_subflow_next()?

bpf_for_each(mptcp_subflow) is expected to be used in the get_subflow()
interface of mptcp_sched_ops to traverse all subflows and then pick one
subflow to send data. This interface is invoked in
mptcp_sched_get_send(), where the msk socket lock is already held:

int mptcp_sched_get_send(struct mptcp_sock *msk)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sched_data data;

        msk_owned_by_me(msk);

	... ...

        mptcp_for_each_subflow(msk, subflow) {
                if (READ_ONCE(subflow->scheduled))
                        return 0;
        }

        data.reinject = false;
        if (msk->sched == &mptcp_sched_default || !msk->sched)
                return mptcp_sched_default_get_subflow(msk, &data);
        return msk->sched->get_subflow(msk, &data);
}

So there is no need to hold the msk socket lock again in BPF schedulers.
This means we can walk the conn_list without taking any extra lock, and
bpf_rcu_read_lock() / bpf_rcu_read_unlock() can be dropped in BPF
schedulers too. Am I right?

Thanks,
-Geliang
diff mbox series

Patch

diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
index 6414824402e6..fec18e7e5e4a 100644
--- a/net/mptcp/bpf.c
+++ b/net/mptcp/bpf.c
@@ -201,9 +201,51 @@  static const struct btf_kfunc_id_set bpf_mptcp_fmodret_set = {
 	.set   = &bpf_mptcp_fmodret_ids,
 };
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "kfuncs which will be used in BPF programs");
+struct bpf_iter_mptcp_subflow {
+	__u64 __opaque[2];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_mptcp_subflow_kern {
+	struct mptcp_sock *msk;
+	struct list_head *pos;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct bpf_iter_mptcp_subflow *it,
+					   struct mptcp_sock *msk)
+{
+	struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
+
+	kit->msk = msk;
+	if (!msk)
+		return -EINVAL;
+
+	kit->pos = &msk->conn_list;
+	return 0;
+}
+
+__bpf_kfunc struct mptcp_subflow_context *
+bpf_iter_mptcp_subflow_next(struct bpf_iter_mptcp_subflow *it)
+{
+	struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
+	struct mptcp_subflow_context *subflow;
+	struct mptcp_sock *msk = kit->msk;
+
+	if (!msk)
+		return NULL;
+
+	subflow = list_entry(kit->pos->next, struct mptcp_subflow_context, node);
+	if (!subflow || list_entry_is_head(subflow, &msk->conn_list, node))
+		return NULL;
+
+	kit->pos = &subflow->node;
+	return subflow;
+}
+
+__bpf_kfunc void bpf_iter_mptcp_subflow_destroy(struct bpf_iter_mptcp_subflow *it)
+{
+}
 
 __bpf_kfunc struct mptcp_subflow_context *
 bpf_mptcp_subflow_ctx_by_pos(const struct mptcp_sched_data *data, unsigned int pos)
@@ -218,12 +260,15 @@  __bpf_kfunc bool bpf_mptcp_subflow_queues_empty(struct sock *sk)
 	return tcp_rtx_queue_empty(sk);
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new)
+BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next)
+BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy)
+BTF_ID_FLAGS(func, mptcp_subflow_active)
 BTF_ID_FLAGS(func, mptcp_subflow_set_scheduled)
 BTF_ID_FLAGS(func, bpf_mptcp_subflow_ctx_by_pos)
-BTF_ID_FLAGS(func, mptcp_subflow_active)
 BTF_ID_FLAGS(func, mptcp_set_timeout)
 BTF_ID_FLAGS(func, mptcp_wnd_end)
 BTF_ID_FLAGS(func, tcp_stream_memory_free)
@@ -241,6 +286,8 @@  static int __init bpf_mptcp_kfunc_init(void)
 	int ret;
 
 	ret = register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+					       &bpf_mptcp_sched_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					       &bpf_mptcp_sched_kfunc_set);
 #ifdef CONFIG_BPF_JIT