diff mbox series

[mptcp-next,v3,1/2] mptcp: add bpf_iter_task for mptcp_sock

Message ID e04cc2fa5f500097ac7d95afb94f68d0d8dc0057.1741577149.git.tanggeliang@kylinos.cn (mailing list archive)
State Rejected, archived
Headers show
Series add bpf_iter_task | expand

Checks

Context Check Description
matttbe/build success Build and static analysis OK
matttbe/checkpatch success total: 0 errors, 0 warnings, 0 checks, 81 lines checked
matttbe/shellcheck success MPTCP selftests files have not been modified
matttbe/KVM_Validation__normal success Success! ✅
matttbe/KVM_Validation__debug fail Critical: Global Timeout ❌
matttbe/KVM_Validation__btf-normal__only_bpftest_all_ success Success! ✅
matttbe/KVM_Validation__btf-debug__only_bpftest_all_ success Success! ✅

Commit Message

Geliang Tang March 10, 2025, 3:30 a.m. UTC
From: Geliang Tang <tanggeliang@kylinos.cn>

To make sure the mptcp_subflow bpf_iter is running in the
MPTCP context, this patch adds a simplified version of tracking
for it:

1. Add a 'struct task_struct *bpf_iter_task' field to struct
mptcp_sock.

2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling
an MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after
the hook returns.

3. In bpf_iter_mptcp_subflow_new(), check

	"READ_ONCE(msk->bpf_iter_task) == current"

to confirm the correct task, and return -EINVAL if it doesn't match.

It also creates helpers for setting, clearing and checking that value.

Suggested-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/mptcp/bpf.c      |  2 ++
 net/mptcp/protocol.c |  1 +
 net/mptcp/protocol.h | 20 ++++++++++++++++++++
 net/mptcp/sched.c    | 15 +++++++++++----
 4 files changed, 34 insertions(+), 4 deletions(-)

Comments

Geliang Tang March 17, 2025, 9:41 a.m. UTC | #1
On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote:
> From: Geliang Tang <tanggeliang@kylinos.cn>
> 
> To make sure the mptcp_subflow bpf_iter is running in the
> MPTCP context. This patch adds a simplified version of tracking
> for it:
> 
> 1. Add a 'struct task_struct *bpf_iter_task' field to struct
> mptcp_sock.
> 
> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling
> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after
> the hook returns.
> 
> 3. In bpf_iter_mptcp_subflow_new(), check
> 
> 	"READ_ONCE(msk->bpf_scheduler_task) == current"
> 
> to confirm the correct task, return -EINVAL if it doesn't match.
> 
> Also creates helpers for setting, clearing and checking that value.
> 
> Suggested-by: Mat Martineau <martineau@kernel.org>
> Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
> ---
>  net/mptcp/bpf.c      |  2 ++
>  net/mptcp/protocol.c |  1 +
>  net/mptcp/protocol.h | 20 ++++++++++++++++++++
>  net/mptcp/sched.c    | 15 +++++++++++----
>  4 files changed, 34 insertions(+), 4 deletions(-)
> 
> diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
> index c0da9ac077e4..0a78604742c7 100644
> --- a/net/mptcp/bpf.c
> +++ b/net/mptcp/bpf.c
> @@ -261,6 +261,8 @@ bpf_iter_mptcp_subflow_new(struct
> bpf_iter_mptcp_subflow *it,
>  		return -EINVAL;
>  
>  	msk = mptcp_sk(sk);
> +	if (!mptcp_check_bpf_iter_task(msk))
> +		return -EINVAL;
>  
>  	msk_owned_by_me(msk);
>  
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 01157ad2e2dc..d98e48ce8cd8 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -2729,6 +2729,7 @@ static void __mptcp_init_sock(struct sock *sk)
>  	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
>  	WRITE_ONCE(msk->csum_enabled,
> mptcp_is_checksum_enabled(sock_net(sk)));
>  	WRITE_ONCE(msk->allow_infinite_fallback, true);
> +	mptcp_clear_bpf_iter_task(msk);
>  	msk->recovery = false;
>  	msk->subflow_id = 1;
>  	msk->last_data_sent = tcp_jiffies32;
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 3492b256ecba..1c6958d64291 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -334,6 +334,7 @@ struct mptcp_sock {
>  				 */
>  	struct mptcp_pm_data	pm;
>  	struct mptcp_sched_ops	*sched;
> +	struct task_struct *bpf_iter_task;
>  	struct {
>  		u32	space;	/* bytes copied in last measurement
> window */
>  		u32	copied; /* bytes copied in this measurement
> window */
> @@ -1291,4 +1292,23 @@ mptcp_token_join_cookie_init_state(struct
> mptcp_subflow_request_sock *subflow_re
>  static inline void mptcp_join_cookie_init(void) {}
>  #endif
>  
> +static inline void mptcp_set_bpf_iter_task(struct mptcp_sock *msk)
> +{
> +	WRITE_ONCE(msk->bpf_iter_task, current);
> +}
> +
> +static inline void mptcp_clear_bpf_iter_task(struct mptcp_sock *msk)
> +{
> +	WRITE_ONCE(msk->bpf_iter_task, NULL);
> +}
> +
> +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock *msk)
> +{
> +	struct task_struct *task = READ_ONCE(msk->bpf_iter_task);
> +
> +	if (task && task == current)
> +		return true;
> +	return false;
> +}

This v3 has a bug. When I was running the MPTCP BPF selftests in a
loop, I found that the test would break in some cases. After
debugging, I found that "task" and "current" were not equal:

[  520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new
msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f
current=0000000024db2987

I will try to fix it, but haven't found a solution yet.
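
For reference, the log line above came from debug instrumentation of
this kind (a hypothetical sketch, not part of the posted patch; the %p
values are hashed by the kernel):

/* hypothetical helper, called from bpf_iter_mptcp_subflow_new() when
 * the check fails; needs <linux/printk.h> and <linux/sched.h>, and
 * msk->bpf_iter_task is the field added by this patch
 */
static void mptcp_debug_iter_task(struct mptcp_sock *msk)
{
	pr_debug("bpf_iter_mptcp_subflow_new msk=%p in_interrupt=%d task=%p current=%p\n",
		 msk, !!in_interrupt(), READ_ONCE(msk->bpf_iter_task),
		 current);
}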

Thanks,
-Geliang

> +
>  #endif /* __MPTCP_PROTOCOL_H */
> diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
> index f09f7eb1d63f..161398f8960c 100644
> --- a/net/mptcp/sched.c
> +++ b/net/mptcp/sched.c
> @@ -155,6 +155,7 @@ void mptcp_subflow_set_scheduled(struct
> mptcp_subflow_context *subflow,
>  int mptcp_sched_get_send(struct mptcp_sock *msk)
>  {
>  	struct mptcp_subflow_context *subflow;
> +	int ret;
>  
>  	msk_owned_by_me(msk);
>  
> @@ -176,12 +177,16 @@ int mptcp_sched_get_send(struct mptcp_sock
> *msk)
>  
>  	if (msk->sched == &mptcp_sched_default || !msk->sched)
>  		return mptcp_sched_default_get_send(msk);
> -	return msk->sched->get_send(msk);
> +	mptcp_set_bpf_iter_task(msk);
> +	ret = msk->sched->get_send(msk);
> +	mptcp_clear_bpf_iter_task(msk);
> +	return ret;
>  }
>  
>  int mptcp_sched_get_retrans(struct mptcp_sock *msk)
>  {
>  	struct mptcp_subflow_context *subflow;
> +	int ret;
>  
>  	msk_owned_by_me(msk);
>  
> @@ -196,7 +201,9 @@ int mptcp_sched_get_retrans(struct mptcp_sock
> *msk)
>  
>  	if (msk->sched == &mptcp_sched_default || !msk->sched)
>  		return mptcp_sched_default_get_retrans(msk);
> -	if (msk->sched->get_retrans)
> -		return msk->sched->get_retrans(msk);
> -	return msk->sched->get_send(msk);
> +	mptcp_set_bpf_iter_task(msk);
> +	ret = msk->sched->get_retrans ? msk->sched->get_retrans(msk)
> :
> +					msk->sched->get_send(msk);
> +	mptcp_clear_bpf_iter_task(msk);
> +	return ret;
>  }
Matthieu Baerts March 17, 2025, 10:29 a.m. UTC | #2
Hi Geliang, Mat,

On 17/03/2025 10:41, Geliang Tang wrote:
> On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote:
>> From: Geliang Tang <tanggeliang@kylinos.cn>
>>
>> To make sure the mptcp_subflow bpf_iter is running in the
>> MPTCP context. This patch adds a simplified version of tracking
>> for it:
>>
>> 1. Add a 'struct task_struct *bpf_iter_task' field to struct
>> mptcp_sock.
>>
>> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling
>> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after
>> the hook returns.
>>
>> 3. In bpf_iter_mptcp_subflow_new(), check
>>
>> 	"READ_ONCE(msk->bpf_scheduler_task) == current"
>>
>> to confirm the correct task, return -EINVAL if it doesn't match.
>>
>> Also creates helpers for setting, clearing and checking that value.
>>
>> Suggested-by: Mat Martineau <martineau@kernel.org>
>> Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
>> ---
>>  net/mptcp/bpf.c      |  2 ++
>>  net/mptcp/protocol.c |  1 +
>>  net/mptcp/protocol.h | 20 ++++++++++++++++++++
>>  net/mptcp/sched.c    | 15 +++++++++++----
>>  4 files changed, 34 insertions(+), 4 deletions(-)
>>
>> diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
>> index c0da9ac077e4..0a78604742c7 100644
>> --- a/net/mptcp/bpf.c
>> +++ b/net/mptcp/bpf.c
>> @@ -261,6 +261,8 @@ bpf_iter_mptcp_subflow_new(struct
>> bpf_iter_mptcp_subflow *it,
>>  		return -EINVAL;
>>  
>>  	msk = mptcp_sk(sk);
>> +	if (!mptcp_check_bpf_iter_task(msk))
>> +		return -EINVAL;
>>  
>>  	msk_owned_by_me(msk);
>>  
>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>> index 01157ad2e2dc..d98e48ce8cd8 100644
>> --- a/net/mptcp/protocol.c
>> +++ b/net/mptcp/protocol.c
>> @@ -2729,6 +2729,7 @@ static void __mptcp_init_sock(struct sock *sk)
>>  	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
>>  	WRITE_ONCE(msk->csum_enabled,
>> mptcp_is_checksum_enabled(sock_net(sk)));
>>  	WRITE_ONCE(msk->allow_infinite_fallback, true);
>> +	mptcp_clear_bpf_iter_task(msk);
>>  	msk->recovery = false;
>>  	msk->subflow_id = 1;
>>  	msk->last_data_sent = tcp_jiffies32;
>> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
>> index 3492b256ecba..1c6958d64291 100644
>> --- a/net/mptcp/protocol.h
>> +++ b/net/mptcp/protocol.h
>> @@ -334,6 +334,7 @@ struct mptcp_sock {
>>  				 */
>>  	struct mptcp_pm_data	pm;
>>  	struct mptcp_sched_ops	*sched;
>> +	struct task_struct *bpf_iter_task;
>>  	struct {
>>  		u32	space;	/* bytes copied in last measurement
>> window */
>>  		u32	copied; /* bytes copied in this measurement
>> window */
>> @@ -1291,4 +1292,23 @@ mptcp_token_join_cookie_init_state(struct
>> mptcp_subflow_request_sock *subflow_re
>>  static inline void mptcp_join_cookie_init(void) {}
>>  #endif
>>  
>> +static inline void mptcp_set_bpf_iter_task(struct mptcp_sock *msk)
>> +{
>> +	WRITE_ONCE(msk->bpf_iter_task, current);
>> +}
>> +
>> +static inline void mptcp_clear_bpf_iter_task(struct mptcp_sock *msk)
>> +{
>> +	WRITE_ONCE(msk->bpf_iter_task, NULL);
>> +}
>> +
>> +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock *msk)
>> +{
>> +	struct task_struct *task = READ_ONCE(msk->bpf_iter_task);
>> +
>> +	if (task && task == current)
>> +		return true;
>> +	return false;
>> +}
> 
> This v3 has a bug. When I was testing MPTCP BPF selftests in a loop, I
> found that the test would break in some cases. After debugging, I found
> that "task" and "current" were not equal:
> 
> [  520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new
> msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f
> current=0000000024db2987
> 
> I will try to fix it, but haven't found a solution yet.

(sorry for the delay, I need a bit of time to catch up)

I talked a bit with Alexei Starovoitov last week. He told me that with
BPF struct_ops, it is possible to tell the verifier that some locks
are taken either by some struct_ops types, or even per callback of a
specific struct_ops (WIP on the sched_ext side). It is also possible
to take some locks automatically (polymorphism), and there are
examples on the VFS side.

In other words, we don't need to add this "bpf_iter_task": there are
other techniques. But I don't have more details, and I didn't check
the code. If it is not clear to you and you don't find other examples
elsewhere (sched_ext? check WIP patches maybe?), then Alexei said we
should not hesitate to ask questions on the BPF mailing list.

Cheers,
Matt
Geliang Tang March 17, 2025, 10:59 a.m. UTC | #3
Hi Matt,

On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote:
> Hi Geliang, Mat,
> 
> On 17/03/2025 10:41, Geliang Tang wrote:
> > On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote:
> > > From: Geliang Tang <tanggeliang@kylinos.cn>
> > > 
> > > To make sure the mptcp_subflow bpf_iter is running in the
> > > MPTCP context. This patch adds a simplified version of tracking
> > > for it:
> > > 
> > > 1. Add a 'struct task_struct *bpf_iter_task' field to struct
> > > mptcp_sock.
> > > 
> > > 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling
> > > a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after
> > > the hook returns.
> > > 
> > > 3. In bpf_iter_mptcp_subflow_new(), check
> > > 
> > > 	"READ_ONCE(msk->bpf_scheduler_task) == current"
> > > 
> > > to confirm the correct task, return -EINVAL if it doesn't match.
> > > 
> > > Also creates helpers for setting, clearing and checking that
> > > value.
> > > 
> > > Suggested-by: Mat Martineau <martineau@kernel.org>
> > > Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
> > > ---
> > >  net/mptcp/bpf.c      |  2 ++
> > >  net/mptcp/protocol.c |  1 +
> > >  net/mptcp/protocol.h | 20 ++++++++++++++++++++
> > >  net/mptcp/sched.c    | 15 +++++++++++----
> > >  4 files changed, 34 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
> > > index c0da9ac077e4..0a78604742c7 100644
> > > --- a/net/mptcp/bpf.c
> > > +++ b/net/mptcp/bpf.c
> > > @@ -261,6 +261,8 @@ bpf_iter_mptcp_subflow_new(struct
> > > bpf_iter_mptcp_subflow *it,
> > >  		return -EINVAL;
> > >  
> > >  	msk = mptcp_sk(sk);
> > > +	if (!mptcp_check_bpf_iter_task(msk))
> > > +		return -EINVAL;
> > >  
> > >  	msk_owned_by_me(msk);
> > >  
> > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > > index 01157ad2e2dc..d98e48ce8cd8 100644
> > > --- a/net/mptcp/protocol.c
> > > +++ b/net/mptcp/protocol.c
> > > @@ -2729,6 +2729,7 @@ static void __mptcp_init_sock(struct sock
> > > *sk)
> > >  	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
> > >  	WRITE_ONCE(msk->csum_enabled,
> > > mptcp_is_checksum_enabled(sock_net(sk)));
> > >  	WRITE_ONCE(msk->allow_infinite_fallback, true);
> > > +	mptcp_clear_bpf_iter_task(msk);
> > >  	msk->recovery = false;
> > >  	msk->subflow_id = 1;
> > >  	msk->last_data_sent = tcp_jiffies32;
> > > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> > > index 3492b256ecba..1c6958d64291 100644
> > > --- a/net/mptcp/protocol.h
> > > +++ b/net/mptcp/protocol.h
> > > @@ -334,6 +334,7 @@ struct mptcp_sock {
> > >  				 */
> > >  	struct mptcp_pm_data	pm;
> > >  	struct mptcp_sched_ops	*sched;
> > > +	struct task_struct *bpf_iter_task;
> > >  	struct {
> > >  		u32	space;	/* bytes copied in last
> > > measurement
> > > window */
> > >  		u32	copied; /* bytes copied in this
> > > measurement
> > > window */
> > > @@ -1291,4 +1292,23 @@ mptcp_token_join_cookie_init_state(struct
> > > mptcp_subflow_request_sock *subflow_re
> > >  static inline void mptcp_join_cookie_init(void) {}
> > >  #endif
> > >  
> > > +static inline void mptcp_set_bpf_iter_task(struct mptcp_sock
> > > *msk)
> > > +{
> > > +	WRITE_ONCE(msk->bpf_iter_task, current);
> > > +}
> > > +
> > > +static inline void mptcp_clear_bpf_iter_task(struct mptcp_sock
> > > *msk)
> > > +{
> > > +	WRITE_ONCE(msk->bpf_iter_task, NULL);
> > > +}
> > > +
> > > +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock
> > > *msk)
> > > +{
> > > +	struct task_struct *task = READ_ONCE(msk-
> > > >bpf_iter_task);
> > > +
> > > +	if (task && task == current)
> > > +		return true;
> > > +	return false;
> > > +}
> > 
> > This v3 has a bug. When I was testing MPTCP BPF selftests in a
> > loop, I
> > found that the test would break in some cases. After debugging, I
> > found
> > that "task" and "current" were not equal:
> > 
> > [  520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new
> > msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f
> > current=0000000024db2987
> > 
> > I will try to fix it, but haven't found a solution yet.
> 
> (sorry for the delay, I need a bit of time to catch up)
> 
> I talked a bit to Alexei Starovoitov last week. He told me that with
> the
> BPF struct_ops, it is possible to tell the verifier that some locks
> are
> taken either by some struct_ops types, or even per callbacks of some
> specific struct_ops (WIP on sched_ext side). It is also possible to
> get
> some locks automatically (polymorphism), and there are examples on
> VFS side.

Thanks for your reminder, I will look at this BPF code. Our goal is
to make the mptcp_subflow bpf_iter usable only by the struct_ops
defined by MPTCP BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops),
right? Other struct_ops are not allowed to use the mptcp_subflow
bpf_iter.

> 
> In other words, it means we don't need to add this "bpf_iter_task",
> there are other techniques, but I don't have more details, and I
> didn't
> check in the code. If it is not clear for you and you don't find 
> other
> examples elsewhere (sched_ext? check WIP patches maybe?), then Alexei
> said we should not hesitate to ask questions on the BPF mailing list.

We can send "Add mptcp_subflow bpf_iter support" v3 to the BPF mailing
list and ask questions during the review process.

Thanks,
-Geliang

> 
> Cheers,
> Matt
Matthieu Baerts March 17, 2025, 1:57 p.m. UTC | #4
Hi Geliang,

On 17/03/2025 11:59, Geliang Tang wrote:
> Hi Matt,
> 
> On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote:
>> Hi Geliang, Mat,
>>
>> On 17/03/2025 10:41, Geliang Tang wrote:
>>> On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote:
>>>> From: Geliang Tang <tanggeliang@kylinos.cn>
>>>>
>>>> To make sure the mptcp_subflow bpf_iter is running in the
>>>> MPTCP context. This patch adds a simplified version of tracking
>>>> for it:
>>>>
>>>> 1. Add a 'struct task_struct *bpf_iter_task' field to struct
>>>> mptcp_sock.
>>>>
>>>> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling
>>>> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after
>>>> the hook returns.
>>>>
>>>> 3. In bpf_iter_mptcp_subflow_new(), check
>>>>
>>>> 	"READ_ONCE(msk->bpf_scheduler_task) == current"
>>>>
>>>> to confirm the correct task, return -EINVAL if it doesn't match.
>>>>
>>>> Also creates helpers for setting, clearing and checking that
>>>> value.

(...)

>>>> +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock
>>>> *msk)
>>>> +{
>>>> +	struct task_struct *task = READ_ONCE(msk-
>>>>> bpf_iter_task);
>>>> +
>>>> +	if (task && task == current)
>>>> +		return true;
>>>> +	return false;
>>>> +}
>>>
>>> This v3 has a bug. When I was testing MPTCP BPF selftests in a
>>> loop, I
>>> found that the test would break in some cases. After debugging, I
>>> found
>>> that "task" and "current" were not equal:
>>>
>>> [  520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new
>>> msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f
>>> current=0000000024db2987
>>>
>>> I will try to fix it, but haven't found a solution yet.
>>
>> (sorry for the delay, I need a bit of time to catch up)
>>
>> I talked a bit to Alexei Starovoitov last week. He told me that with
>> the
>> BPF struct_ops, it is possible to tell the verifier that some locks
>> are
>> taken either by some struct_ops types, or even per callbacks of some
>> specific struct_ops (WIP on sched_ext side). It is also possible to
>> get
>> some locks automatically (polymorphism), and there are examples on
>> VFS side.
> 
> Thanks for your reminder, I will look at these BPF codes. Our goal is
> to make mptcp_subflow bpf_iter only used by struct_ops defined by MPTCP
> BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other struct_ops
> are not allowed to use mptcp_subflow bpf_iter.

Yes, that's correct. I didn't check, but **maybe** some
mptcp_sched_ops and mptcp_pm_ops callbacks might not be allowed to use
bpf_iter. In this case, we might need to allow only some of them to
use bpf_iter.

Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be
done while holding the msk lock, e.g. being called from the worker and
not from a subflow event, etc., but maybe some exceptions are needed.

>> In other words, it means we don't need to add this "bpf_iter_task",
>> there are other techniques, but I don't have more details, and I
>> didn't
>> check in the code. If it is not clear for you and you don't find 
>> other
>> examples elsewhere (sched_ext? check WIP patches maybe?), then Alexei
>> said we should not hesitate to ask questions on the BPF mailing list.
> 
> We can send "Add mptcp_subflow bpf_iter support" v3 to the BPF mailing
> list and ask questions during the reviewing process.

Indeed. I hope to be able to find a bit of time this week to send the v3.

Cheers,
Matt
Mat Martineau March 18, 2025, 1:25 a.m. UTC | #5
On Mon, 17 Mar 2025, Matthieu Baerts wrote:

> Hi Geliang,
>
> On 17/03/2025 11:59, Geliang Tang wrote:
>> Hi Matt,
>>
>> On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote:
>>> Hi Geliang, Mat,
>>>
>>> On 17/03/2025 10:41, Geliang Tang wrote:
>>>> On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote:
>>>>> From: Geliang Tang <tanggeliang@kylinos.cn>
>>>>>
>>>>> To make sure the mptcp_subflow bpf_iter is running in the
>>>>> MPTCP context. This patch adds a simplified version of tracking
>>>>> for it:
>>>>>
>>>>> 1. Add a 'struct task_struct *bpf_iter_task' field to struct
>>>>> mptcp_sock.
>>>>>
>>>>> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling
>>>>> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after
>>>>> the hook returns.
>>>>>
>>>>> 3. In bpf_iter_mptcp_subflow_new(), check
>>>>>
>>>>> 	"READ_ONCE(msk->bpf_scheduler_task) == current"
>>>>>
>>>>> to confirm the correct task, return -EINVAL if it doesn't match.
>>>>>
>>>>> Also creates helpers for setting, clearing and checking that
>>>>> value.
>
> (...)
>
>>>>> +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock
>>>>> *msk)
>>>>> +{
>>>>> +	struct task_struct *task = READ_ONCE(msk-
>>>>>> bpf_iter_task);
>>>>> +
>>>>> +	if (task && task == current)
>>>>> +		return true;
>>>>> +	return false;
>>>>> +}
>>>>
>>>> This v3 has a bug. When I was testing MPTCP BPF selftests in a
>>>> loop, I
>>>> found that the test would break in some cases. After debugging, I
>>>> found
>>>> that "task" and "current" were not equal:
>>>>
>>>> [  520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new
>>>> msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f
>>>> current=0000000024db2987
>>>>
>>>> I will try to fix it, but haven't found a solution yet.
>>>
>>> (sorry for the delay, I need a bit of time to catch up)
>>>
>>> I talked a bit to Alexei Starovoitov last week. He told me that with
>>> the
>>> BPF struct_ops, it is possible to tell the verifier that some locks
>>> are
>>> taken either by some struct_ops types, or even per callbacks of some
>>> specific struct_ops (WIP on sched_ext side). It is also possible to
>>> get
>>> some locks automatically (polymorphism), and there are examples on
>>> VFS side.
>>
>> Thanks for your reminder, I will look at these BPF codes. Our goal is
>> to make mptcp_subflow bpf_iter only used by struct_ops defined by MPTCP
>> BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other struct_ops
>> are not allowed to use mptcp_subflow bpf_iter.
>
> Yes, that's correct. I didn't check, but **maybe** some mptcp_sched_ops
> and mptcp_pm_ops callback might not be allowed to use bpf_iter. In this
> case, it might be needed to allow only some of them to use bpf_iter.
>
> Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be
> done while holding the msk lock, e.g. being called from the worker and
> not from a subflow event, etc. but maybe there are some exceptions needed.

Hi Matthieu -

Is this last paragraph also feedback from Alexei? Can you elaborate a 
little - is the lock needed for BPF-specific reasons or to avoid having 
the callback modify msk state?

I'll try to look around at the verifier and automatic locking
techniques you mentioned above; it would be great to have a better way
to handle socket locks w/ BPF.

- Mat

>
>>> In other words, it means we don't need to add this "bpf_iter_task",
>>> there are other techniques, but I don't have more details, and I
>>> didn't
>>> check in the code. If it is not clear for you and you don't find
>>> other
>>> examples elsewhere (sched_ext? check WIP patches maybe?), then Alexei
>>> said we should not hesitate to ask questions on the BPF mailing list.
>>
>> We can send "Add mptcp_subflow bpf_iter support" v3 to the BPF mailing
>> list and ask questions during the reviewing process.
>
> Indeed. I hope to be able to find a bit of time this week to send the v3.
>
> Cheers,
> Matt
> -- 
> Sponsored by the NGI0 Core fund.
>
>
Geliang Tang March 18, 2025, 10:35 a.m. UTC | #6
Hi Matt,

On Mon, 2025-03-17 at 14:57 +0100, Matthieu Baerts wrote:
> Hi Geliang,
> 
> On 17/03/2025 11:59, Geliang Tang wrote:
> > Hi Matt,
> > 
> > On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote:
> > > Hi Geliang, Mat,
> > > 
> > > On 17/03/2025 10:41, Geliang Tang wrote:
> > > > On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote:
> > > > > From: Geliang Tang <tanggeliang@kylinos.cn>
> > > > > 
> > > > > To make sure the mptcp_subflow bpf_iter is running in the
> > > > > MPTCP context. This patch adds a simplified version of
> > > > > tracking
> > > > > for it:
> > > > > 
> > > > > 1. Add a 'struct task_struct *bpf_iter_task' field to struct
> > > > > mptcp_sock.
> > > > > 
> > > > > 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before
> > > > > calling
> > > > > a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL)
> > > > > after
> > > > > the hook returns.
> > > > > 
> > > > > 3. In bpf_iter_mptcp_subflow_new(), check
> > > > > 
> > > > > 	"READ_ONCE(msk->bpf_scheduler_task) == current"
> > > > > 
> > > > > to confirm the correct task, return -EINVAL if it doesn't
> > > > > match.
> > > > > 
> > > > > Also creates helpers for setting, clearing and checking that
> > > > > value.
> 
> (...)
> 
> > > > > +static inline bool mptcp_check_bpf_iter_task(struct
> > > > > mptcp_sock
> > > > > *msk)
> > > > > +{
> > > > > +	struct task_struct *task = READ_ONCE(msk-
> > > > > > bpf_iter_task);
> > > > > +
> > > > > +	if (task && task == current)
> > > > > +		return true;
> > > > > +	return false;
> > > > > +}
> > > > 
> > > > This v3 has a bug. When I was testing MPTCP BPF selftests in a
> > > > loop, I
> > > > found that the test would break in some cases. After debugging,
> > > > I
> > > > found
> > > > that "task" and "current" were not equal:
> > > > 
> > > > [  520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new
> > > > msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f
> > > > current=0000000024db2987
> > > > 
> > > > I will try to fix it, but haven't found a solution yet.
> > > 
> > > (sorry for the delay, I need a bit of time to catch up)
> > > 
> > > I talked a bit to Alexei Starovoitov last week. He told me that
> > > with
> > > the
> > > BPF struct_ops, it is possible to tell the verifier that some
> > > locks
> > > are
> > > taken either by some struct_ops types, or even per callbacks of
> > > some
> > > specific struct_ops (WIP on sched_ext side). It is also possible
> > > to
> > > get
> > > some locks automatically (polymorphism), and there are examples
> > > on
> > > VFS side.
> > 
> > Thanks for your reminder, I will look at these BPF codes. Our goal
> > is
> > to make mptcp_subflow bpf_iter only used by struct_ops defined by
> > MPTCP
> > BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other
> > struct_ops
> > are not allowed to use mptcp_subflow bpf_iter.

I checked and found that other struct_ops **cannot** use the
mptcp_subflow bpf_iter. Although we registered this bpf_iter for the
BPF_PROG_TYPE_STRUCT_OPS program type, bpf_iter_mptcp_subflow_new()
checks that sk->sk_protocol must be IPPROTO_MPTCP. Does this mean that
other struct_ops cannot use the mptcp_subflow bpf_iter successfully? I
don't know if this check is sufficient.

> 
> Yes, that's correct. I didn't check, but **maybe** some
> mptcp_sched_ops
> and mptcp_pm_ops callback might not be allowed to use bpf_iter. In
> this
> case, it might be needed to allow only some of them to use bpf_iter.

I guess it's not easy to allow only some callbacks of a struct_ops to
access certain functions, because the BPF verifier verifies the
struct_ops as a whole. But I will continue to look for a solution.

> 
> Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be
> done while holding the msk lock, e.g. being called from the worker
> and
> not from a subflow event, etc. but maybe there are some exceptions
> needed.

I checked the following 19 callbacks of mptcp_{pm,sched}_ops that have
been implemented in my code tree. Except for the three exceptions
(get_local_id, get_priority and add_addr_received), the other 16
callbacks are all invoked while holding the msk lock:

struct mptcp_sched_ops {
        int (*get_send)(struct mptcp_sock *msk);
        int (*get_retrans)(struct mptcp_sock *msk);
}

struct mptcp_pm_ops {
        /* required */
        int (*get_local_id)(struct mptcp_sock *msk,
                            struct mptcp_pm_addr_entry *skc);
        bool (*get_priority)(struct mptcp_sock *msk,
                             struct mptcp_addr_info *skc);

        /* optional */
        void (*established)(struct mptcp_sock *msk);
        void (*subflow_established)(struct mptcp_sock *msk);

        /* required */
        bool (*allow_new_subflow)(struct mptcp_sock *msk);
        bool (*accept_new_subflow)(const struct mptcp_sock *msk);
        bool (*add_addr_echo)(struct mptcp_sock *msk,
                              const struct mptcp_addr_info *addr);

        /* optional */
        int (*add_addr_received)(struct mptcp_sock *msk,
                                 const struct mptcp_addr_info *addr);
        void (*rm_addr_received)(struct mptcp_sock *msk);

        /* optional */
        int (*add_addr)(struct mptcp_sock *msk,
                        struct mptcp_pm_addr_entry *entry);
        int (*del_addr)(struct mptcp_sock *msk,
                        const struct mptcp_pm_addr_entry *entry);
        int (*flush_addrs)(struct mptcp_sock *msk,
                           struct list_head *rm_list);

        /* optional */
        int (*address_announce)(struct mptcp_sock *msk,
                                struct mptcp_pm_addr_entry *local);
        int (*address_remove)(struct mptcp_sock *msk,
                              struct mptcp_pm_addr_entry *local);
        int (*subflow_create)(struct mptcp_sock *msk,
                              struct mptcp_pm_addr_entry *local,
                              struct mptcp_addr_info *remote);
        int (*subflow_destroy)(struct mptcp_sock *msk,
                               struct mptcp_pm_addr_entry *local,
                               struct mptcp_addr_info *remote);

        /* required */
        int (*set_priority)(struct mptcp_sock *msk,
                            struct mptcp_pm_addr_entry *local,
                            struct mptcp_pm_addr_entry *remote);
};

For these three exceptions: add_addr_received is invoked in
mptcp_pm_add_addr_received, where the socket lock of the ssk is
already held.

Similarly, get_local_id and get_priority are called in
subflow_chk_local_id, where the socket lock of the ssk is already
held too.

In addition, get_local_id and get_priority are also called in
subflow_token_join_request, which runs in atomic context and can hold
the msk lock through bh_lock_sock. If locking is required here, I can
send a patch to do this (see the sketch below).
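
A minimal sketch of what such a patch could look like (illustrative
only; msk->pm.ops->get_local_id() is from this WIP series):

static int sketch_get_local_id_atomic(struct mptcp_sock *msk,
				      struct mptcp_pm_addr_entry *skc)
{
	struct sock *sk = (struct sock *)msk;
	int local_id;

	/* subflow_token_join_request() runs in atomic context, so the
	 * msk lock is taken with bh_lock_sock() instead of lock_sock()
	 */
	bh_lock_sock(sk);
	local_id = msk->pm.ops->get_local_id(msk, skc);
	bh_unlock_sock(sk);

	return local_id;
}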

In this way, all callbacks of mptcp_{pm,sched}_ops can be done while
holding the msk lock or the ssk lock.

Thanks,
-Geliang

> 
> > > In other words, it means we don't need to add this
> > > "bpf_iter_task",
> > > there are other techniques, but I don't have more details, and I
> > > didn't
> > > check in the code. If it is not clear for you and you don't find 
> > > other
> > > examples elsewhere (sched_ext? check WIP patches maybe?), then
> > > Alexei
> > > said we should not hesitate to ask questions on the BPF mailing
> > > list.
> > 
> > We can send "Add mptcp_subflow bpf_iter support" v3 to the BPF
> > mailing
> > list and ask questions during the reviewing process.
> 
> Indeed. I hope to be able to find a bit of time this week to send the
> v3.
> 
> Cheers,
> Matt
Matthieu Baerts March 18, 2025, 10:54 a.m. UTC | #7
Hi Mat,

On 18/03/2025 02:25, Mat Martineau wrote:
> On Mon, 17 Mar 2025, Matthieu Baerts wrote:
> 
>> Hi Geliang,
>>
>> On 17/03/2025 11:59, Geliang Tang wrote:
>>> Hi Matt,
>>>
>>> On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote:
>>>> Hi Geliang, Mat,
>>>>
>>>> On 17/03/2025 10:41, Geliang Tang wrote:
>>>>> On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote:
>>>>>> From: Geliang Tang <tanggeliang@kylinos.cn>
>>>>>>
>>>>>> To make sure the mptcp_subflow bpf_iter is running in the
>>>>>> MPTCP context. This patch adds a simplified version of tracking
>>>>>> for it:
>>>>>>
>>>>>> 1. Add a 'struct task_struct *bpf_iter_task' field to struct
>>>>>> mptcp_sock.
>>>>>>
>>>>>> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling
>>>>>> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after
>>>>>> the hook returns.
>>>>>>
>>>>>> 3. In bpf_iter_mptcp_subflow_new(), check
>>>>>>
>>>>>>     "READ_ONCE(msk->bpf_scheduler_task) == current"
>>>>>>
>>>>>> to confirm the correct task, return -EINVAL if it doesn't match.
>>>>>>
>>>>>> Also creates helpers for setting, clearing and checking that
>>>>>> value.
>>
>> (...)
>>
>>>>>> +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock
>>>>>> *msk)
>>>>>> +{
>>>>>> +    struct task_struct *task = READ_ONCE(msk-
>>>>>>> bpf_iter_task);
>>>>>> +
>>>>>> +    if (task && task == current)
>>>>>> +        return true;
>>>>>> +    return false;
>>>>>> +}
>>>>>
>>>>> This v3 has a bug. When I was testing MPTCP BPF selftests in a
>>>>> loop, I
>>>>> found that the test would break in some cases. After debugging, I
>>>>> found
>>>>> that "task" and "current" were not equal:
>>>>>
>>>>> [  520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new
>>>>> msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f
>>>>> current=0000000024db2987
>>>>>
>>>>> I will try to fix it, but haven't found a solution yet.
>>>>
>>>> (sorry for the delay, I need a bit of time to catch up)
>>>>
>>>> I talked a bit to Alexei Starovoitov last week. He told me that with
>>>> the
>>>> BPF struct_ops, it is possible to tell the verifier that some locks
>>>> are
>>>> taken either by some struct_ops types, or even per callbacks of some
>>>> specific struct_ops (WIP on sched_ext side). It is also possible to
>>>> get
>>>> some locks automatically (polymorphism), and there are examples on
>>>> VFS side.
>>>
>>> Thanks for your reminder, I will look at these BPF codes. Our goal is
>>> to make mptcp_subflow bpf_iter only used by struct_ops defined by MPTCP
>>> BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other struct_ops
>>> are not allowed to use mptcp_subflow bpf_iter.
>>
>> Yes, that's correct. I didn't check, but **maybe** some mptcp_sched_ops
>> and mptcp_pm_ops callback might not be allowed to use bpf_iter. In this
>> case, it might be needed to allow only some of them to use bpf_iter.
>>
>> Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be
>> done while holding the msk lock, e.g. being called from the worker and
>> not from a subflow event, etc. but maybe there are some exceptions
>> needed.
> 
> Hi Matthieu -
> 
> Is this last paragraph also feedback from Alexei? Can you elaborate a
> little - is the lock needed for BPF-specific reasons or to avoid having
> the callback modify msk state?

No, it is not from Alexei, but from me. Currently, some actions on a
subflow call some code in pm.c that simply schedules the worker, so
that it is called from a different context to do other actions. To
convert this to an mptcp_pm_ops, I think we should not have two
callbacks -- one to schedule the worker, and one to react from the
worker context -- but only one, called from the worker context if such
a callback is set. This should also simplify the BPF PMs if all/most
callbacks are done from the same context while owning the msk lock,
no? See the sketch below.
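
Something like this rough sketch (the names are illustrative, not an
agreed API; only mptcp_schedule_work() and the pm.status/pm.lock
fields already exist):

/* subflow event path: only record the event and kick the worker */
static void sketch_pm_event(struct mptcp_sock *msk, u8 status_bit)
{
	spin_lock_bh(&msk->pm.lock);
	msk->pm.status |= BIT(status_bit);
	mptcp_schedule_work((struct sock *)msk);
	spin_unlock_bh(&msk->pm.lock);
}

/* worker path, msk lock held: the only place running the callback */
static void sketch_pm_worker(struct mptcp_sock *msk, u8 status_bit)
{
	if ((msk->pm.status & BIT(status_bit)) && msk->pm.ops->event_cb)
		msk->pm.ops->event_cb(msk);
}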

> I'll try to look around at the verifier and automatic locking techniques
> you mentioned above, it would be great to have a better way to handle
> socket locks w/ BPF.

Alexei suggested looking at the VFS code. If I understood correctly,
the idea is to have some locks automatically taken for some actions,
without having to dedicate new kfuncs, check if the locks are already
taken, or require the BPF code to handle them (and auto-release them).
But I don't have more details about that.

Cheers,
Matt
Matthieu Baerts March 18, 2025, 11:26 a.m. UTC | #8
Hi Geliang,

On 18/03/2025 11:35, Geliang Tang wrote:
> Hi Matt,
> 
> On Mon, 2025-03-17 at 14:57 +0100, Matthieu Baerts wrote:
>> Hi Geliang,
>>
>> On 17/03/2025 11:59, Geliang Tang wrote:
>>> Hi Matt,
>>>
>>> On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote:
>>>> Hi Geliang, Mat,
>>>>
>>>> On 17/03/2025 10:41, Geliang Tang wrote:
>>>>> On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote:
>>>>>> From: Geliang Tang <tanggeliang@kylinos.cn>
>>>>>>
>>>>>> To make sure the mptcp_subflow bpf_iter is running in the
>>>>>> MPTCP context. This patch adds a simplified version of
>>>>>> tracking
>>>>>> for it:
>>>>>>
>>>>>> 1. Add a 'struct task_struct *bpf_iter_task' field to struct
>>>>>> mptcp_sock.
>>>>>>
>>>>>> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before
>>>>>> calling
>>>>>> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL)
>>>>>> after
>>>>>> the hook returns.
>>>>>>
>>>>>> 3. In bpf_iter_mptcp_subflow_new(), check
>>>>>>
>>>>>> 	"READ_ONCE(msk->bpf_scheduler_task) == current"
>>>>>>
>>>>>> to confirm the correct task, return -EINVAL if it doesn't
>>>>>> match.
>>>>>>
>>>>>> Also creates helpers for setting, clearing and checking that
>>>>>> value.
>>
>> (...)
>>
>>>>>> +static inline bool mptcp_check_bpf_iter_task(struct
>>>>>> mptcp_sock
>>>>>> *msk)
>>>>>> +{
>>>>>> +	struct task_struct *task = READ_ONCE(msk-
>>>>>>> bpf_iter_task);
>>>>>> +
>>>>>> +	if (task && task == current)
>>>>>> +		return true;
>>>>>> +	return false;
>>>>>> +}
>>>>>
>>>>> This v3 has a bug. When I was testing MPTCP BPF selftests in a
>>>>> loop, I
>>>>> found that the test would break in some cases. After debugging,
>>>>> I
>>>>> found
>>>>> that "task" and "current" were not equal:
>>>>>
>>>>> [  520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new
>>>>> msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f
>>>>> current=0000000024db2987
>>>>>
>>>>> I will try to fix it, but haven't found a solution yet.
>>>>
>>>> (sorry for the delay, I need a bit of time to catch up)
>>>>
>>>> I talked a bit to Alexei Starovoitov last week. He told me that
>>>> with
>>>> the
>>>> BPF struct_ops, it is possible to tell the verifier that some
>>>> locks
>>>> are
>>>> taken either by some struct_ops types, or even per callbacks of
>>>> some
>>>> specific struct_ops (WIP on sched_ext side). It is also possible
>>>> to
>>>> get
>>>> some locks automatically (polymorphism), and there are examples
>>>> on
>>>> VFS side.
>>>
>>> Thanks for your reminder, I will look at these BPF codes. Our goal
>>> is
>>> to make mptcp_subflow bpf_iter only used by struct_ops defined by
>>> MPTCP
>>> BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other
>>> struct_ops
>>> are not allowed to use mptcp_subflow bpf_iter.
> 
> I checked and found that other struct_ops **cannot** use mptcp_subflow
> bpf_iter. Although we registered this bpf_iter for use with
> BPF_PROG_TYPE_STRUCT_OPS type, we checked in bpf_iter_mptcp_subflow_new
> that sk->sk_protocol must be IPPROTO_MPTCP. Does this mean that other
> struct_ops cannot use mptcp_subflow bpf_iter successfully? I don't know
> if this check is sufficient.

Sorry, I don't know. With the current version, I don't see any link
between the mptcp_subflow bpf_iter and mptcp_{sched,pm}_ops, so I
don't know how the restriction would work. I guess there might be a
restriction because "struct mptcp_sock *" is used in the arguments?
But I guess that's not enough, because such structures can be obtained
from different struct_ops. Probably something else is missing to
establish this link then?
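
One existing mechanism that might fit here -- not verified -- is the
generic kfunc filter (the .filter member of struct btf_kfunc_id_set),
as used by tracing_iter_filter() for bpf_sock_destroy() in
net/core/filter.c. A rough sketch; recognising "an MPTCP struct_ops"
from the prog is precisely the missing piece:

BTF_KFUNCS_START(bpf_mptcp_iter_kfunc_ids)
BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_mptcp_subflow_destroy, KF_ITER_DESTROY)
BTF_KFUNCS_END(bpf_mptcp_iter_kfunc_ids)

static int bpf_mptcp_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
	/* only a first approximation: this still allows any struct_ops,
	 * not just mptcp_{sched,pm}_ops
	 */
	if (btf_id_set8_contains(&bpf_mptcp_iter_kfunc_ids, kfunc_id) &&
	    prog->type != BPF_PROG_TYPE_STRUCT_OPS)
		return -EACCES;
	return 0;
}

static const struct btf_kfunc_id_set bpf_mptcp_iter_kfunc_set = {
	.owner	= THIS_MODULE,
	.set	= &bpf_mptcp_iter_kfunc_ids,
	.filter	= bpf_mptcp_iter_filter,
};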

>> Yes, that's correct. I didn't check, but **maybe** some
>> mptcp_sched_ops
>> and mptcp_pm_ops callback might not be allowed to use bpf_iter. In
>> this
>> case, it might be needed to allow only some of them to use bpf_iter.
> 
> I guess it's not easy to allow only some callbacks of a struct_ops to
> access certain functions, because The BPF verifier verifies the
> struct_ops as a whole. But I will continue to look for a solution.

Alexei told me that this work was in progress for sched_ext, but I don't
know more about that, sorry.

>> Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be
>> done while holding the msk lock, e.g. being called from the worker
>> and
>> not from a subflow event, etc. but maybe there are some exceptions
>> needed.
> 
> I checked the following 19 callbacks of mptcp_{pm,sched}_ops that have
> been implemented in my code tree. Except for the three exceptions
> (get_local_id, get_priority and add_addr_received), the other 17
> callbacks are all be done while holding the msk lock:

Thank you for having checked!

> struct mptcp_sched_ops {
>         int (*get_send)(struct mptcp_sock *msk);
>         int (*get_retrans)(struct mptcp_sock *msk);
> }
> 
> struct mptcp_pm_ops {
>         /* required */
>         int (*get_local_id)(struct mptcp_sock *msk,
>                             struct mptcp_pm_addr_entry *skc);
>         bool (*get_priority)(struct mptcp_sock *msk,
>                              struct mptcp_addr_info *skc);
> 
>         /* optional */
>         void (*established)(struct mptcp_sock *msk);
>         void (*subflow_established)(struct mptcp_sock *msk);
> 
>         /* required */
>         bool (*allow_new_subflow)(struct mptcp_sock *msk);
>         bool (*accept_new_subflow)(const struct mptcp_sock *msk);
>         bool (*add_addr_echo)(struct mptcp_sock *msk,
>                               const struct mptcp_addr_info *addr);
> 
>         /* optional */
>         int (*add_addr_received)(struct mptcp_sock *msk,
>                                  const struct mptcp_addr_info *addr);
>         void (*rm_addr_received)(struct mptcp_sock *msk);
> 
>         /* optional */
>         int (*add_addr)(struct mptcp_sock *msk,
>                         struct mptcp_pm_addr_entry *entry);
>         int (*del_addr)(struct mptcp_sock *msk,
>                         const struct mptcp_pm_addr_entry *entry);
>         int (*flush_addrs)(struct mptcp_sock *msk,
>                            struct list_head *rm_list);
> 
>         /* optional */
>         int (*address_announce)(struct mptcp_sock *msk,
>                                 struct mptcp_pm_addr_entry *local);
>         int (*address_remove)(struct mptcp_sock *msk,
>                               struct mptcp_pm_addr_entry *local);
>         int (*subflow_create)(struct mptcp_sock *msk,
>                               struct mptcp_pm_addr_entry *local,
>                               struct mptcp_addr_info *remote);
>         int (*subflow_destroy)(struct mptcp_sock *msk,
>                                struct mptcp_pm_addr_entry *local,
>                                struct mptcp_addr_info *remote);
> 
>         /* required */
>         int (*set_priority)(struct mptcp_sock *msk,
>                             struct mptcp_pm_addr_entry *local,
>                             struct mptcp_pm_addr_entry *remote);

Mmh, all 8 callbacks from add_addr to here seem to be linked to
Netlink commands, right? If yes, they should not be here: they don't
make sense for a BPF PM, no? BPF PMs should be configured with BPF,
and not via Netlink. Netlink is only for the built-in PMs (in-kernel
and userspace PMs).

> };
> 
> For these three exceptions, add_addr_received is invoked in
> mptcp_pm_add_addr_received, here the socket lock of ssk is already
> holding.

For add_addr_received, I think it is safer to have the callback run
from the worker context. In other words, when an ADD_ADDR is received
on a subflow (see the sketch after this list):

- the ADD_ADDR echo should be sent: I don't think it is worth letting
the other peer resend it just in case the userspace PM was not "ready"

- if pm->ops->add_addr_received is set, schedule the worker and set
  msk->pm.remote

- then pm->ops->add_addr_received will be called from the worker.
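
Roughly, as a sketch (pm.ops->add_addr_received is from this WIP
series; mptcp_pm_announce_addr() and mptcp_schedule_work() are
existing helpers, simplified here with the pm locking elided):

/* subflow event path: echo immediately, then defer to the worker */
static void sketch_add_addr_received(struct mptcp_sock *msk,
				     const struct mptcp_addr_info *addr)
{
	mptcp_pm_announce_addr(msk, addr, true);	/* send the echo */

	if (msk->pm.ops && msk->pm.ops->add_addr_received) {
		msk->pm.remote = *addr;
		mptcp_schedule_work((struct sock *)msk);
	}
}

/* worker path, msk lock held: run the PM callback */
static void sketch_pm_worker(struct mptcp_sock *msk)
{
	if (msk->pm.ops && msk->pm.ops->add_addr_received)
		msk->pm.ops->add_addr_received(msk, &msk->pm.remote);
}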

> Similarly, in subflow_chk_local_id, get_local_id and get_priority are
> called, where the socket lock of ssk is already holding too.
> 
> In addition, get_local_id and get_priority are also called in
> subflow_token_join_request, which is in atomic and can hold msk lock
> through bh_lock_sock. If locking is required here, I can send a patch
> to do this.

Yes, for the ID and backup, I guess we will need an exception there,
and a way not to let the BPF PMs call bpf_iter. We should not hold the
msk lock here.

I guess the easiest would be to ask the BPF maintainers what we should
do here. Maybe this can be done after having sent the mptcp_subflow v3
series, or at the same time. I will see what I can do.

> In this way, all callbacks of mptcp_{pm,sched}_ops can be done while
> holding the msk lock or the ssk lock.

At least on the scheduler side, that will be the case (or maybe not if
we change the API to cover more cases :) ).

Anyway, it is probably best to wait for the BPF maintainers'
recommendations instead of guessing.

Cheers,
Matt
Mat Martineau March 18, 2025, 8:09 p.m. UTC | #9
On Tue, 18 Mar 2025, Matthieu Baerts wrote:

> Hi Mat,
>
> On 18/03/2025 02:25, Mat Martineau wrote:
>> On Mon, 17 Mar 2025, Matthieu Baerts wrote:
>>
>>> Hi Geliang,
>>>
>>> On 17/03/2025 11:59, Geliang Tang wrote:
>>>> Hi Matt,
>>>>
>>>> On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote:
>>>>> Hi Geliang, Mat,
>>>>>
>>>>> On 17/03/2025 10:41, Geliang Tang wrote:
>>>>>> On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote:
>>>>>>> From: Geliang Tang <tanggeliang@kylinos.cn>
>>>>>>>
>>>>>>> To make sure the mptcp_subflow bpf_iter is running in the
>>>>>>> MPTCP context. This patch adds a simplified version of tracking
>>>>>>> for it:
>>>>>>>
>>>>>>> 1. Add a 'struct task_struct *bpf_iter_task' field to struct
>>>>>>> mptcp_sock.
>>>>>>>
>>>>>>> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling
>>>>>>> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after
>>>>>>> the hook returns.
>>>>>>>
>>>>>>> 3. In bpf_iter_mptcp_subflow_new(), check
>>>>>>>
>>>>>>>     "READ_ONCE(msk->bpf_scheduler_task) == current"
>>>>>>>
>>>>>>> to confirm the correct task, return -EINVAL if it doesn't match.
>>>>>>>
>>>>>>> Also creates helpers for setting, clearing and checking that
>>>>>>> value.
>>>
>>> (...)
>>>
>>>>>>> +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock
>>>>>>> *msk)
>>>>>>> +{
>>>>>>> +    struct task_struct *task = READ_ONCE(msk-
>>>>>>>> bpf_iter_task);
>>>>>>> +
>>>>>>> +    if (task && task == current)
>>>>>>> +        return true;
>>>>>>> +    return false;
>>>>>>> +}
>>>>>>
>>>>>> This v3 has a bug. When I was testing MPTCP BPF selftests in a
>>>>>> loop, I
>>>>>> found that the test would break in some cases. After debugging, I
>>>>>> found
>>>>>> that "task" and "current" were not equal:
>>>>>>
>>>>>> [  520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new
>>>>>> msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f
>>>>>> current=0000000024db2987
>>>>>>
>>>>>> I will try to fix it, but haven't found a solution yet.
>>>>>
>>>>> (sorry for the delay, I need a bit of time to catch up)
>>>>>
>>>>> I talked a bit to Alexei Starovoitov last week. He told me that with
>>>>> the
>>>>> BPF struct_ops, it is possible to tell the verifier that some locks
>>>>> are
>>>>> taken either by some struct_ops types, or even per callbacks of some
>>>>> specific struct_ops (WIP on sched_ext side). It is also possible to
>>>>> get
>>>>> some locks automatically (polymorphism), and there are examples on
>>>>> VFS side.
>>>>
>>>> Thanks for your reminder, I will look at these BPF codes. Our goal is
>>>> to make mptcp_subflow bpf_iter only used by struct_ops defined by MPTCP
>>>> BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other struct_ops
>>>> are not allowed to use mptcp_subflow bpf_iter.
>>>
>>> Yes, that's correct. I didn't check, but **maybe** some mptcp_sched_ops
>>> and mptcp_pm_ops callback might not be allowed to use bpf_iter. In this
>>> case, it might be needed to allow only some of them to use bpf_iter.
>>>
>>> Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be
>>> done while holding the msk lock, e.g. being called from the worker and
>>> not from a subflow event, etc. but maybe there are some exceptions
>>> needed.
>>
>> Hi Matthieu -
>>
>> Is this last paragraph also feedback from Alexei? Can you elaborate a
>> little - is the lock needed for BPF-specific reasons or to avoid having
>> the callback modify msk state?
>
> No, it is not from Alexei, but from me. Currently, some actions on a
> subflow will call some code in pm.c that will simply schedule the worker
> to be called from a different context to do other actions. To convert
> this to an mptcp_pm_ops, I think we should not have two callbacks -- one
> to schedule the worker, and one to react from the worker context -- but
> only one called from the worker context if such callback is set. This
> should also simplify the BPF PMs if all/most callbacks are done from the
> same context while owning the msk lock, no?
>

Ok, thanks for the details. I agree, for the PM callbacks it makes sense 
to only have a worker-context callback.


- Mat


>> I'll try to look around at the verifier and automatic locking techniques
>> you mentioned above, it would be great to have a better way to handle
>> socket locks w/ BPF.
>
> Alexei suggested to look at the VFS code. If I understood correctly, the
> idea is to have some locks automatically taken for some actions, but
> without having to dedicate new kfunc, checking if they are already
> taken, or requiring the BPF code to handle them (auto-releasing them
> then). But I don't have more details about that.
>
Geliang Tang March 21, 2025, 4:14 a.m. UTC | #10
Hi Matt,

On Tue, 2025-03-18 at 12:26 +0100, Matthieu Baerts wrote:
> Hi Geliang,
> 
> On 18/03/2025 11:35, Geliang Tang wrote:
> > Hi Matt,
> > 
> > On Mon, 2025-03-17 at 14:57 +0100, Matthieu Baerts wrote:
> > > Hi Geliang,
> > > 
> > > On 17/03/2025 11:59, Geliang Tang wrote:
> > > > Hi Matt,
> > > > 
> > > > On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote:
> > > > > Hi Geliang, Mat,
> > > > > 
> > > > > On 17/03/2025 10:41, Geliang Tang wrote:
> > > > > > On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote:
> > > > > > > From: Geliang Tang <tanggeliang@kylinos.cn>
> > > > > > > 
> > > > > > > To make sure the mptcp_subflow bpf_iter is running in the
> > > > > > > MPTCP context. This patch adds a simplified version of
> > > > > > > tracking
> > > > > > > for it:
> > > > > > > 
> > > > > > > 1. Add a 'struct task_struct *bpf_iter_task' field to
> > > > > > > struct
> > > > > > > mptcp_sock.
> > > > > > > 
> > > > > > > 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before
> > > > > > > calling
> > > > > > > a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task,
> > > > > > > NULL)
> > > > > > > after
> > > > > > > the hook returns.
> > > > > > > 
> > > > > > > 3. In bpf_iter_mptcp_subflow_new(), check
> > > > > > > 
> > > > > > > 	"READ_ONCE(msk->bpf_scheduler_task) == current"
> > > > > > > 
> > > > > > > to confirm the correct task, return -EINVAL if it doesn't
> > > > > > > match.
> > > > > > > 
> > > > > > > Also creates helpers for setting, clearing and checking
> > > > > > > that
> > > > > > > value.
> > > 
> > > (...)
> > > 
> > > > > > > +static inline bool mptcp_check_bpf_iter_task(struct
> > > > > > > mptcp_sock
> > > > > > > *msk)
> > > > > > > +{
> > > > > > > +	struct task_struct *task = READ_ONCE(msk-
> > > > > > > > bpf_iter_task);
> > > > > > > +
> > > > > > > +	if (task && task == current)
> > > > > > > +		return true;
> > > > > > > +	return false;
> > > > > > > +}
> > > > > > 
> > > > > > This v3 has a bug. When I was testing MPTCP BPF selftests
> > > > > > in a
> > > > > > loop, I
> > > > > > found that the test would break in some cases. After
> > > > > > debugging,
> > > > > > I
> > > > > > found
> > > > > > that "task" and "current" were not equal:
> > > > > > 
> > > > > > [  520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new
> > > > > > msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f
> > > > > > current=0000000024db2987
> > > > > > 
> > > > > > I will try to fix it, but haven't found a solution yet.
> > > > > 
> > > > > (sorry for the delay, I need a bit of time to catch up)
> > > > > 
> > > > > I talked a bit to Alexei Starovoitov last week. He told me
> > > > > that
> > > > > with
> > > > > the
> > > > > BPF struct_ops, it is possible to tell the verifier that some
> > > > > locks
> > > > > are
> > > > > taken either by some struct_ops types, or even per callbacks
> > > > > of
> > > > > some
> > > > > specific struct_ops (WIP on sched_ext side). It is also
> > > > > possible
> > > > > to
> > > > > get
> > > > > some locks automatically (polymorphism), and there are
> > > > > examples
> > > > > on
> > > > > VFS side.
> > > > 
> > > > Thanks for your reminder, I will look at these BPF codes. Our
> > > > goal
> > > > is
> > > > to make mptcp_subflow bpf_iter only used by struct_ops defined
> > > > by
> > > > MPTCP
> > > > BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other
> > > > struct_ops
> > > > are not allowed to use mptcp_subflow bpf_iter.
> > 
> > I checked and found that other struct_ops **cannot** use
> > mptcp_subflow
> > bpf_iter. Although we registered this bpf_iter for use with
> > BPF_PROG_TYPE_STRUCT_OPS type, we checked in
> > bpf_iter_mptcp_subflow_new
> > that sk->sk_protocol must be IPPROTO_MPTCP. Does this mean that
> > other
> > struct_ops cannot use mptcp_subflow bpf_iter successfully? I don't
> > know
> > if this check is sufficient.
> 
> Sorry, I don't know. With the current version, I don't see any links
> between mptcp_subflow bpf_iter and mptcp_{sched,pm}_ops, then I don't
> know how the restriction works, right? I guess there might be a
> restriction because "struct mptcp_sock*" are used in arguments? But I
> guess that's not enough because such structures can be obtained from
> different struct_ops. Probably something else is missing to have this
> link then?
> 
> > > Yes, that's correct. I didn't check, but **maybe** some
> > > mptcp_sched_ops
> > > and mptcp_pm_ops callback might not be allowed to use bpf_iter.
> > > In
> > > this
> > > case, it might be needed to allow only some of them to use
> > > bpf_iter.
> > 
> > I guess it's not easy to allow only some callbacks of a struct_ops
> > to
> > access certain functions, because The BPF verifier verifies the
> > struct_ops as a whole. But I will continue to look for a solution.
> 
> Alexei told me that this work was in progress for sched_ext, but I
> don't
> know more about that, sorry.
> 
> > > Note that it sounds like all mptcp_{pm,sched}_ops callbacks
> > > should be
> > > done while holding the msk lock, e.g. being called from the
> > > worker
> > > and
> > > not from a subflow event, etc. but maybe there are some
> > > exceptions
> > > needed.
> > 
> > I checked the following 19 callbacks of mptcp_{pm,sched}_ops that
> > have
> > been implemented in my code tree. Except for the three exceptions
> > (get_local_id, get_priority and add_addr_received), the other 17
> > callbacks are all be done while holding the msk lock:
> 
> Thank you for having checked!
> 
> > struct mptcp_sched_ops {
> >         int (*get_send)(struct mptcp_sock *msk);
> >         int (*get_retrans)(struct mptcp_sock *msk);
> > }
> > 
> > struct mptcp_pm_ops {
> >         /* required */
> >         int (*get_local_id)(struct mptcp_sock *msk,
> >                             struct mptcp_pm_addr_entry *skc);
> >         bool (*get_priority)(struct mptcp_sock *msk,
> >                              struct mptcp_addr_info *skc);
> > 
> >         /* optional */
> >         void (*established)(struct mptcp_sock *msk);
> >         void (*subflow_established)(struct mptcp_sock *msk);
> > 
> >         /* required */
> >         bool (*allow_new_subflow)(struct mptcp_sock *msk);
> >         bool (*accept_new_subflow)(const struct mptcp_sock *msk);
> >         bool (*add_addr_echo)(struct mptcp_sock *msk,
> >                               const struct mptcp_addr_info *addr);
> > 
> >         /* optional */
> >         int (*add_addr_received)(struct mptcp_sock *msk,
> >                                  const struct mptcp_addr_info
> > *addr);
> >         void (*rm_addr_received)(struct mptcp_sock *msk);
> > 
> >         /* optional */
> >         int (*add_addr)(struct mptcp_sock *msk,
> >                         struct mptcp_pm_addr_entry *entry);
> >         int (*del_addr)(struct mptcp_sock *msk,
> >                         const struct mptcp_pm_addr_entry *entry);
> >         int (*flush_addrs)(struct mptcp_sock *msk,
> >                            struct list_head *rm_list);
> > 
> >         /* optional */
> >         int (*address_announce)(struct mptcp_sock *msk,
> >                                 struct mptcp_pm_addr_entry *local);
> >         int (*address_remove)(struct mptcp_sock *msk,
> >                               struct mptcp_pm_addr_entry *local);
> >         int (*subflow_create)(struct mptcp_sock *msk,
> >                               struct mptcp_pm_addr_entry *local,
> >                               struct mptcp_addr_info *remote);
> >         int (*subflow_destroy)(struct mptcp_sock *msk,
> >                                struct mptcp_pm_addr_entry *local,
> >                                struct mptcp_addr_info *remote);
> > 
> >         /* required */
> >         int (*set_priority)(struct mptcp_sock *msk,
> >                             struct mptcp_pm_addr_entry *local,
> >                             struct mptcp_pm_addr_entry *remote);
> 
> Mmh, all 8 callbacks from add_addr to here seem to be linked to
> Netlink commands, right? If yes, they should not be here: they don't
> make sense for a BPF PM, no? BPF PMs should be configured with BPF,
> and not via

Do you have any idea how we can pass commands like ADD_ADDR and
addresses from user space to BPF?

What I can think of is passing them through a sockopt: user space
passes the command and the address via setsockopt(), and a BPF program
handles it through a custom "cgroup/setsockopt" hook, roughly as in
the sketch below. I would like to hear your opinions. If you also
agree to use sockopt to implement it, I can write a test program for
this.
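
Something like this (only a sketch: SOL_MPTCP is the existing socket
level, MPTCP_PM_BPF_CMD is a made-up option number, and the actual
command handling is left out):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define SOL_MPTCP		284	/* from include/linux/socket.h */
#define MPTCP_PM_BPF_CMD	0xbf	/* made up for this sketch */

SEC("cgroup/setsockopt")
int pm_setsockopt(struct bpf_sockopt *ctx)
{
	struct mptcp_pm_addr_entry *entry;

	if (ctx->level != SOL_MPTCP || ctx->optname != MPTCP_PM_BPF_CMD)
		return 1;	/* not for us: let the kernel handle it */

	if (ctx->optval + sizeof(*entry) > ctx->optval_end)
		return 0;	/* malformed: reject with -EPERM */
	entry = ctx->optval;

	/* act on the command here, e.g. stash it for the PM callbacks */
	bpf_printk("BPF PM command for address id %u", entry->addr.id);

	ctx->optlen = -1;	/* consumed by BPF: skip kernel handling */
	return 1;
}

char _license[] SEC("license") = "GPL";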

Thanks,
-Geliang

> Netlink. Netlink is only for the built-in PMs (in-kernel and
> userspace PMs)
> 
> > };
> > 
> > For these three exceptions, add_addr_received is invoked in
> > mptcp_pm_add_addr_received, where the ssk socket lock is already
> > held.
> 
> For add_addr_received, I think it is safer to have the callback from
> the worker context. In other words, when an ADD_ADDR is received on
> a subflow:
> 
> - the ADD_ADDR echo should be sent: I don't think it is worth
>   letting the other peer resend it just in case the userspace PM was
>   not "ready"
> 
> - if pm->ops->add_addr_received is set, schedule the worker and set
>   msk->pm.remote
> 
> - then pm->ops->add_addr_received will be called from the worker
>   (rough sketch below).
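> 
> Something like this in mptcp_pm_add_addr_received() (a rough sketch
> only; the exact helper used to kick the worker may differ):
> 
> 	if (pm->ops && pm->ops->add_addr_received) {
> 		pm->remote = *addr;
> 		mptcp_schedule_work((struct sock *)msk);
> 	}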
> 
> > Similarly, in subflow_chk_local_id, get_local_id and get_priority
> > are called, where the ssk socket lock is already held too.
> > 
> > In addition, get_local_id and get_priority are also called in
> > subflow_token_join_request, which runs in atomic context and can
> > hold the msk lock through bh_lock_sock. If locking is required
> > here, I can send a patch to do this.
> 
> Yes, for the ID and backup, I guess we will need an exception there,
> and a way not to let the BPF PMs call bpf_iter. We should not hold
> the msk lock here.
> 
> I guess the easiest would be to ask the BPF maintainers what we
> should do here. Maybe this can be done after having sent the
> mptcp_subflow v3 series, or at the same time. I will see what I can
> do.
> 
> > In this way, all callbacks of mptcp_{pm,sched}_ops can be done
> > while
> > holding the msk lock or the ssk lock.
> 
> At least on the scheduler side, that will be the case (or maybe not
> if we change the API to cover more cases :) ).
> 
> Anyway, probably best to wait for BPF maintainers recommendations
> instead of guessing.
> 
> Cheers,
> Matt
Matthieu Baerts March 21, 2025, 9:19 a.m. UTC | #11
Hi Geliang,

On 21/03/2025 05:14, Geliang Tang wrote:
> On Tue, 2025-03-18 at 12:26 +0100, Matthieu Baerts wrote:

(...)

>> Mmh, all 8 callbacks from add_addr to here seem to be linked to
>> Netlink
>> commands, right? If yes, they should not be here: they don't make
>> sense
>> for a BPF PM, no? BPF PMs should be configured with BPF, and not via
>
> Do you have any idea how we can pass commands like ADD_ADDR and
> addresses from user space to BPF?

The userspace can inject any kind of BPF program. Additional addresses
can then be hardcoded in this program, or it can load them from eBPF
maps, or read skel->bss->(...), etc.

Netlink is then not needed.
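
For example, a sketch (pm_addrs is an arbitrary name, and the callback
body is left out):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 8);
	__type(key, __u32);
	__type(value, struct mptcp_pm_addr_entry);
} pm_addrs SEC(".maps");

SEC("struct_ops")
void BPF_PROG(established, struct mptcp_sock *msk)
{
	struct mptcp_pm_addr_entry *entry;
	__u32 id = 0;

	entry = bpf_map_lookup_elem(&pm_addrs, &id);
	if (!entry)
		return;

	/* announce entry->addr, create a subflow towards it, etc. */
}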

> What I can think of is passing through sockopt, the user space passes
> the command and the address through setsockopt, and the BPF program
> handles it through the custom "cgroup/setsockopt". I would like to hear
> your opinions. If you also agree to use sockopt to implement it, I can
> write a test program for this.

That's a good idea for userspace apps which want to control how the
MPTCP PM is acting. But I guess an MPTCP BPF PM will generally not be
linked to network apps: a separate application will be dedicated to
injecting and configuring the BPF PM, no?

Cheers,
Matt
Geliang Tang March 21, 2025, 10:05 a.m. UTC | #12
Hi Matt,

On Fri, 2025-03-21 at 10:19 +0100, Matthieu Baerts wrote:
> Hi Geliang,
> 
> On 21/03/2025 05:14, Geliang Tang wrote:
> > On Tue, 2025-03-18 at 12:26 +0100, Matthieu Baerts wrote:
> 
> (...)
> 
> > > Mmh, all 8 callbacks from add_addr to here seem to be linked to
> > > Netlink
> > > commands, right? If yes, they should not be here: they don't make
> > > sense
> > > for a BPF PM, no? BPF PMs should be configured with BPF, and not
> > > via
> > 
> > Do you have any idea how we can pass commands like ADD_ADDR and
> > addresses from user space to BPF?
> 
> The userspace can inject any kind of BPF program. Additional
> addresses
> can then be hardcoded in this program, or it can load them from eBPF
> maps, or read skel->bss->(...), etc.
> 
> Netlink is then not needed.
> 
> > What I can think of is passing through sockopt, the user space
> > passes
> > the command and the address through setsockopt, and the BPF program
> > handles it through the custom "cgroup/setsockopt". I would like to
> > hear
> > your opinions. If you also agree to use sockopt to implement it, I
> > can
> > write a test program for this.

I just tested it and found it difficult to use setsockopt.

"cgroup/setsockopt" program doesn't work. We need to invoke sleepable
kfuncs in BPF PM:

BTF_ID_FLAGS(func, mptcp_pm_remove_addr_entry, KF_SLEEPABLE)
BTF_ID_FLAGS(func, mptcp_pm_addr_send_ack, KF_SLEEPABLE)
BTF_ID_FLAGS(func, mptcp_pm_mp_prio_send_ack, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_mptcp_subflow_connect, KF_SLEEPABLE)
BTF_ID_FLAGS(func, mptcp_subflow_shutdown, KF_SLEEPABLE)
BTF_ID_FLAGS(func, mptcp_close_ssk, KF_SLEEPABLE)

They can't be invoked in a "cgroup/setsockopt" program, and a
"cgroup/setsockopt" program can't be loaded with the BPF_F_SLEEPABLE
flag:

        err = err ?: bpf_program__set_flags(skel->progs.pm_setsockopt,
                                            BPF_F_SLEEPABLE);

Loading then fails with this error:

test_bpf_hashmap_pm:PASS:set sleepable flags 0 nsec
libbpf: prog 'pm_setsockopt': BPF program load failed: -EINVAL
libbpf: prog 'pm_setsockopt': -- BEGIN PROG LOAD LOG --
Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs
can be sleepable
processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0
peak_states 0 mark_read 0

It seems that we can only choose one of the fentry/fexit/fmod_ret,
lsm, iter, uprobe, or struct_ops program types to implement the BPF
PM.

Thanks,
-Geliang

> That's a good idea for userspace apps which want to control how the
> MPTCP PM is acting. I guess MPTCP BPF PM will generally not be linked
> to
> network apps: a separate application dedicated to inject and
> configure
> the BPF PM, no?
> 
> Cheers,
> Matt
Matthieu Baerts March 21, 2025, 10:16 a.m. UTC | #13
On 21/03/2025 11:05, Geliang Tang wrote:
> Hi Matt,
> 
> On Fri, 2025-03-21 at 10:19 +0100, Matthieu Baerts wrote:
>> Hi Geliang,
>>
>> On 21/03/2025 05:14, Geliang Tang wrote:
>>> On Tue, 2025-03-18 at 12:26 +0100, Matthieu Baerts wrote:
>>
>> (...)
>>
>>>> Mmh, all 8 callbacks from add_addr to here seem to be linked to
>>>> Netlink
>>>> commands, right? If yes, they should not be here: they don't make
>>>> sense
>>>> for a BPF PM, no? BPF PMs should be configured with BPF, and not
>>>> via
>>>
>>> Do you have any idea how we can pass commands like ADD_ADDR and
>>> addresses from user space to BPF?
>>
>> The userspace can inject any kind of BPF program. Additional
>> addresses
>> can then be hardcoded in this program, or it can load them from eBPF
>> maps, or read skel->bss->(...), etc.
>>
>> Netlink is then not needed.
>>
>>> What I can think of is passing through sockopt, the user space
>>> passes
>>> the command and the address through setsockopt, and the BPF program
>>> handles it through the custom "cgroup/setsockopt". I would like to
>>> hear
>>> your opinions. If you also agree to use sockopt to implement it, I
>>> can
>>> write a test program for this.
> 
> I just tested it and found it difficult to use setsockopt.
> 
> "cgroup/setsockopt" program doesn't work. We need to invoke sleepable
> kfuncs in BPF PM:

(...)

> It seems that we can only choose one of fentry/fexit/fmod_ret, lsm,
> iter, uprobe, or struct_ops type to implement BPF PM.

Thank you for having checked. I think we can look at that aspect
later. I don't think many apps will want this kind of control. They
can always set something in a map, and the BPF program can look at it
when needed, e.g. with a small userspace helper like in the sketch
below.
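
Just as an example (a sketch: pm_addrs is the map from the earlier
sketch, and "struct mptcp_bpf_pm" stands for the generated skeleton
type):

static int pm_add_addr(struct mptcp_bpf_pm *skel,
		       const struct mptcp_pm_addr_entry *entry)
{
	__u32 id = entry->addr.id;

	/* make the address visible to the BPF PM via the map */
	return bpf_map_update_elem(bpf_map__fd(skel->maps.pm_addrs),
				   &id, entry, BPF_ANY);
}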

Cheers,
Matt
diff mbox series

Patch

diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
index c0da9ac077e4..0a78604742c7 100644
--- a/net/mptcp/bpf.c
+++ b/net/mptcp/bpf.c
@@ -261,6 +261,8 @@  bpf_iter_mptcp_subflow_new(struct bpf_iter_mptcp_subflow *it,
 		return -EINVAL;
 
 	msk = mptcp_sk(sk);
+	if (!mptcp_check_bpf_iter_task(msk))
+		return -EINVAL;
 
 	msk_owned_by_me(msk);
 
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 01157ad2e2dc..d98e48ce8cd8 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2729,6 +2729,7 @@  static void __mptcp_init_sock(struct sock *sk)
 	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
 	WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
 	WRITE_ONCE(msk->allow_infinite_fallback, true);
+	mptcp_clear_bpf_iter_task(msk);
 	msk->recovery = false;
 	msk->subflow_id = 1;
 	msk->last_data_sent = tcp_jiffies32;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 3492b256ecba..1c6958d64291 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -334,6 +334,7 @@  struct mptcp_sock {
 				 */
 	struct mptcp_pm_data	pm;
 	struct mptcp_sched_ops	*sched;
+	struct task_struct *bpf_iter_task;
 	struct {
 		u32	space;	/* bytes copied in last measurement window */
 		u32	copied; /* bytes copied in this measurement window */
@@ -1291,4 +1292,23 @@  mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_re
 static inline void mptcp_join_cookie_init(void) {}
 #endif
 
+static inline void mptcp_set_bpf_iter_task(struct mptcp_sock *msk)
+{
+	WRITE_ONCE(msk->bpf_iter_task, current);
+}
+
+static inline void mptcp_clear_bpf_iter_task(struct mptcp_sock *msk)
+{
+	WRITE_ONCE(msk->bpf_iter_task, NULL);
+}
+
+static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock *msk)
+{
+	struct task_struct *task = READ_ONCE(msk->bpf_iter_task);
+
+	if (task && task == current)
+		return true;
+	return false;
+}
+
 #endif /* __MPTCP_PROTOCOL_H */
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
index f09f7eb1d63f..161398f8960c 100644
--- a/net/mptcp/sched.c
+++ b/net/mptcp/sched.c
@@ -155,6 +155,7 @@  void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
 int mptcp_sched_get_send(struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow;
+	int ret;
 
 	msk_owned_by_me(msk);
 
@@ -176,12 +177,16 @@  int mptcp_sched_get_send(struct mptcp_sock *msk)
 
 	if (msk->sched == &mptcp_sched_default || !msk->sched)
 		return mptcp_sched_default_get_send(msk);
-	return msk->sched->get_send(msk);
+	mptcp_set_bpf_iter_task(msk);
+	ret = msk->sched->get_send(msk);
+	mptcp_clear_bpf_iter_task(msk);
+	return ret;
 }
 
 int mptcp_sched_get_retrans(struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow;
+	int ret;
 
 	msk_owned_by_me(msk);
 
@@ -196,7 +201,9 @@  int mptcp_sched_get_retrans(struct mptcp_sock *msk)
 
 	if (msk->sched == &mptcp_sched_default || !msk->sched)
 		return mptcp_sched_default_get_retrans(msk);
-	if (msk->sched->get_retrans)
-		return msk->sched->get_retrans(msk);
-	return msk->sched->get_send(msk);
+	mptcp_set_bpf_iter_task(msk);
+	ret = msk->sched->get_retrans ? msk->sched->get_retrans(msk) :
+					msk->sched->get_send(msk);
+	mptcp_clear_bpf_iter_task(msk);
+	return ret;
 }
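
For illustration, with this in place a BPF scheduler's get_send() hook
runs between the set/clear pair above, so it is allowed to use the
subflow iterator. A minimal sketch, assuming the bpf_for_each() helper
and the kfuncs used by the selftests of this series:

SEC("struct_ops")
int BPF_PROG(bpf_first_get_send, struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;

	/* mptcp_sched_get_send() has set msk->bpf_iter_task to current
	 * before invoking this hook, so the mptcp_check_bpf_iter_task()
	 * check in bpf_iter_mptcp_subflow_new() passes here.
	 */
	bpf_for_each(mptcp_subflow, subflow, (struct sock *)msk) {
		mptcp_subflow_set_scheduled(subflow, true);
		break;	/* schedule on the first subflow only */
	}
	return 0;
}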