Message ID | e04cc2fa5f500097ac7d95afb94f68d0d8dc0057.1741577149.git.tanggeliang@kylinos.cn (mailing list archive) |
---|---|
State | Rejected, archived |
Headers | show |
Series | add bpf_iter_task | expand |
Context | Check | Description |
---|---|---|
matttbe/build | success | Build and static analysis OK |
matttbe/checkpatch | success | total: 0 errors, 0 warnings, 0 checks, 81 lines checked |
matttbe/shellcheck | success | MPTCP selftests files have not been modified |
matttbe/KVM_Validation__normal | success | Success! ✅ |
matttbe/KVM_Validation__debug | fail | Critical: Global Timeout ❌ |
matttbe/KVM_Validation__btf-normal__only_bpftest_all_ | success | Success! ✅ |
matttbe/KVM_Validation__btf-debug__only_bpftest_all_ | success | Success! ✅ |
On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote: > From: Geliang Tang <tanggeliang@kylinos.cn> > > To make sure the mptcp_subflow bpf_iter is running in the > MPTCP context. This patch adds a simplified version of tracking > for it: > > 1. Add a 'struct task_struct *bpf_iter_task' field to struct > mptcp_sock. > > 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling > a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after > the hook returns. > > 3. In bpf_iter_mptcp_subflow_new(), check > > "READ_ONCE(msk->bpf_scheduler_task) == current" > > to confirm the correct task, return -EINVAL if it doesn't match. > > Also creates helpers for setting, clearing and checking that value. > > Suggested-by: Mat Martineau <martineau@kernel.org> > Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn> > --- > net/mptcp/bpf.c | 2 ++ > net/mptcp/protocol.c | 1 + > net/mptcp/protocol.h | 20 ++++++++++++++++++++ > net/mptcp/sched.c | 15 +++++++++++---- > 4 files changed, 34 insertions(+), 4 deletions(-) > > diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c > index c0da9ac077e4..0a78604742c7 100644 > --- a/net/mptcp/bpf.c > +++ b/net/mptcp/bpf.c > @@ -261,6 +261,8 @@ bpf_iter_mptcp_subflow_new(struct > bpf_iter_mptcp_subflow *it, > return -EINVAL; > > msk = mptcp_sk(sk); > + if (!mptcp_check_bpf_iter_task(msk)) > + return -EINVAL; > > msk_owned_by_me(msk); > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c > index 01157ad2e2dc..d98e48ce8cd8 100644 > --- a/net/mptcp/protocol.c > +++ b/net/mptcp/protocol.c > @@ -2729,6 +2729,7 @@ static void __mptcp_init_sock(struct sock *sk) > inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; > WRITE_ONCE(msk->csum_enabled, > mptcp_is_checksum_enabled(sock_net(sk))); > WRITE_ONCE(msk->allow_infinite_fallback, true); > + mptcp_clear_bpf_iter_task(msk); > msk->recovery = false; > msk->subflow_id = 1; > msk->last_data_sent = tcp_jiffies32; > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h > index 
3492b256ecba..1c6958d64291 100644 > --- a/net/mptcp/protocol.h > +++ b/net/mptcp/protocol.h > @@ -334,6 +334,7 @@ struct mptcp_sock { > */ > struct mptcp_pm_data pm; > struct mptcp_sched_ops *sched; > + struct task_struct *bpf_iter_task; > struct { > u32 space; /* bytes copied in last measurement > window */ > u32 copied; /* bytes copied in this measurement > window */ > @@ -1291,4 +1292,23 @@ mptcp_token_join_cookie_init_state(struct > mptcp_subflow_request_sock *subflow_re > static inline void mptcp_join_cookie_init(void) {} > #endif > > +static inline void mptcp_set_bpf_iter_task(struct mptcp_sock *msk) > +{ > + WRITE_ONCE(msk->bpf_iter_task, current); > +} > + > +static inline void mptcp_clear_bpf_iter_task(struct mptcp_sock *msk) > +{ > + WRITE_ONCE(msk->bpf_iter_task, NULL); > +} > + > +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock *msk) > +{ > + struct task_struct *task = READ_ONCE(msk->bpf_iter_task); > + > + if (task && task == current) > + return true; > + return false; > +} This v3 has a bug. When I was testing MPTCP BPF selftests in a loop, I found that the test would break in some cases. After debugging, I found that "task" and "current" were not equal: [ 520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f current=0000000024db2987 I will try to fix it, but haven't found a solution yet. 
Thanks, -Geliang > + > #endif /* __MPTCP_PROTOCOL_H */ > diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c > index f09f7eb1d63f..161398f8960c 100644 > --- a/net/mptcp/sched.c > +++ b/net/mptcp/sched.c > @@ -155,6 +155,7 @@ void mptcp_subflow_set_scheduled(struct > mptcp_subflow_context *subflow, > int mptcp_sched_get_send(struct mptcp_sock *msk) > { > struct mptcp_subflow_context *subflow; > + int ret; > > msk_owned_by_me(msk); > > @@ -176,12 +177,16 @@ int mptcp_sched_get_send(struct mptcp_sock > *msk) > > if (msk->sched == &mptcp_sched_default || !msk->sched) > return mptcp_sched_default_get_send(msk); > - return msk->sched->get_send(msk); > + mptcp_set_bpf_iter_task(msk); > + ret = msk->sched->get_send(msk); > + mptcp_clear_bpf_iter_task(msk); > + return ret; > } > > int mptcp_sched_get_retrans(struct mptcp_sock *msk) > { > struct mptcp_subflow_context *subflow; > + int ret; > > msk_owned_by_me(msk); > > @@ -196,7 +201,9 @@ int mptcp_sched_get_retrans(struct mptcp_sock > *msk) > > if (msk->sched == &mptcp_sched_default || !msk->sched) > return mptcp_sched_default_get_retrans(msk); > - if (msk->sched->get_retrans) > - return msk->sched->get_retrans(msk); > - return msk->sched->get_send(msk); > + mptcp_set_bpf_iter_task(msk); > + ret = msk->sched->get_retrans ? msk->sched->get_retrans(msk) > : > + msk->sched->get_send(msk); > + mptcp_clear_bpf_iter_task(msk); > + return ret; > }
Hi Geliang, Mat, On 17/03/2025 10:41, Geliang Tang wrote: > On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote: >> From: Geliang Tang <tanggeliang@kylinos.cn> >> >> To make sure the mptcp_subflow bpf_iter is running in the >> MPTCP context. This patch adds a simplified version of tracking >> for it: >> >> 1. Add a 'struct task_struct *bpf_iter_task' field to struct >> mptcp_sock. >> >> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling >> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after >> the hook returns. >> >> 3. In bpf_iter_mptcp_subflow_new(), check >> >> "READ_ONCE(msk->bpf_scheduler_task) == current" >> >> to confirm the correct task, return -EINVAL if it doesn't match. >> >> Also creates helpers for setting, clearing and checking that value. >> >> Suggested-by: Mat Martineau <martineau@kernel.org> >> Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn> >> --- >> net/mptcp/bpf.c | 2 ++ >> net/mptcp/protocol.c | 1 + >> net/mptcp/protocol.h | 20 ++++++++++++++++++++ >> net/mptcp/sched.c | 15 +++++++++++---- >> 4 files changed, 34 insertions(+), 4 deletions(-) >> >> diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c >> index c0da9ac077e4..0a78604742c7 100644 >> --- a/net/mptcp/bpf.c >> +++ b/net/mptcp/bpf.c >> @@ -261,6 +261,8 @@ bpf_iter_mptcp_subflow_new(struct >> bpf_iter_mptcp_subflow *it, >> return -EINVAL; >> >> msk = mptcp_sk(sk); >> + if (!mptcp_check_bpf_iter_task(msk)) >> + return -EINVAL; >> >> msk_owned_by_me(msk); >> >> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c >> index 01157ad2e2dc..d98e48ce8cd8 100644 >> --- a/net/mptcp/protocol.c >> +++ b/net/mptcp/protocol.c >> @@ -2729,6 +2729,7 @@ static void __mptcp_init_sock(struct sock *sk) >> inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; >> WRITE_ONCE(msk->csum_enabled, >> mptcp_is_checksum_enabled(sock_net(sk))); >> WRITE_ONCE(msk->allow_infinite_fallback, true); >> + mptcp_clear_bpf_iter_task(msk); >> msk->recovery = false; >> msk->subflow_id = 1; >> 
msk->last_data_sent = tcp_jiffies32; >> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h >> index 3492b256ecba..1c6958d64291 100644 >> --- a/net/mptcp/protocol.h >> +++ b/net/mptcp/protocol.h >> @@ -334,6 +334,7 @@ struct mptcp_sock { >> */ >> struct mptcp_pm_data pm; >> struct mptcp_sched_ops *sched; >> + struct task_struct *bpf_iter_task; >> struct { >> u32 space; /* bytes copied in last measurement >> window */ >> u32 copied; /* bytes copied in this measurement >> window */ >> @@ -1291,4 +1292,23 @@ mptcp_token_join_cookie_init_state(struct >> mptcp_subflow_request_sock *subflow_re >> static inline void mptcp_join_cookie_init(void) {} >> #endif >> >> +static inline void mptcp_set_bpf_iter_task(struct mptcp_sock *msk) >> +{ >> + WRITE_ONCE(msk->bpf_iter_task, current); >> +} >> + >> +static inline void mptcp_clear_bpf_iter_task(struct mptcp_sock *msk) >> +{ >> + WRITE_ONCE(msk->bpf_iter_task, NULL); >> +} >> + >> +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock *msk) >> +{ >> + struct task_struct *task = READ_ONCE(msk->bpf_iter_task); >> + >> + if (task && task == current) >> + return true; >> + return false; >> +} > > This v3 has a bug. When I was testing MPTCP BPF selftests in a loop, I > found that the test would break in some cases. After debugging, I found > that "task" and "current" were not equal: > > [ 520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new > msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f > current=0000000024db2987 > > I will try to fix it, but haven't found a solution yet. (sorry for the delay, I need a bit of time to catch up) I talked a bit to Alexei Starovoitov last week. He told me that with the BPF struct_ops, it is possible to tell the verifier that some locks are taken either by some struct_ops types, or even per callbacks of some specific struct_ops (WIP on sched_ext side). It is also possible to get some locks automatically (polymorphism), and there are examples on VFS side. 
In other words, it means we don't need to add this "bpf_iter_task"; there are other techniques, but I don't have more details, and I didn't check in the code. If it is not clear for you and you don't find other examples elsewhere (sched_ext? check WIP patches maybe?), then Alexei said we should not hesitate to ask questions on the BPF mailing list. Cheers, Matt
Hi Matt, On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote: > Hi Geliang, Mat, > > On 17/03/2025 10:41, Geliang Tang wrote: > > On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote: > > > From: Geliang Tang <tanggeliang@kylinos.cn> > > > > > > To make sure the mptcp_subflow bpf_iter is running in the > > > MPTCP context. This patch adds a simplified version of tracking > > > for it: > > > > > > 1. Add a 'struct task_struct *bpf_iter_task' field to struct > > > mptcp_sock. > > > > > > 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling > > > a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after > > > the hook returns. > > > > > > 3. In bpf_iter_mptcp_subflow_new(), check > > > > > > "READ_ONCE(msk->bpf_scheduler_task) == current" > > > > > > to confirm the correct task, return -EINVAL if it doesn't match. > > > > > > Also creates helpers for setting, clearing and checking that > > > value. > > > > > > Suggested-by: Mat Martineau <martineau@kernel.org> > > > Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn> > > > --- > > > net/mptcp/bpf.c | 2 ++ > > > net/mptcp/protocol.c | 1 + > > > net/mptcp/protocol.h | 20 ++++++++++++++++++++ > > > net/mptcp/sched.c | 15 +++++++++++---- > > > 4 files changed, 34 insertions(+), 4 deletions(-) > > > > > > diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c > > > index c0da9ac077e4..0a78604742c7 100644 > > > --- a/net/mptcp/bpf.c > > > +++ b/net/mptcp/bpf.c > > > @@ -261,6 +261,8 @@ bpf_iter_mptcp_subflow_new(struct > > > bpf_iter_mptcp_subflow *it, > > > return -EINVAL; > > > > > > msk = mptcp_sk(sk); > > > + if (!mptcp_check_bpf_iter_task(msk)) > > > + return -EINVAL; > > > > > > msk_owned_by_me(msk); > > > > > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c > > > index 01157ad2e2dc..d98e48ce8cd8 100644 > > > --- a/net/mptcp/protocol.c > > > +++ b/net/mptcp/protocol.c > > > @@ -2729,6 +2729,7 @@ static void __mptcp_init_sock(struct sock > > > *sk) > > > inet_csk(sk)->icsk_sync_mss = 
mptcp_sync_mss; > > > WRITE_ONCE(msk->csum_enabled, > > > mptcp_is_checksum_enabled(sock_net(sk))); > > > WRITE_ONCE(msk->allow_infinite_fallback, true); > > > + mptcp_clear_bpf_iter_task(msk); > > > msk->recovery = false; > > > msk->subflow_id = 1; > > > msk->last_data_sent = tcp_jiffies32; > > > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h > > > index 3492b256ecba..1c6958d64291 100644 > > > --- a/net/mptcp/protocol.h > > > +++ b/net/mptcp/protocol.h > > > @@ -334,6 +334,7 @@ struct mptcp_sock { > > > */ > > > struct mptcp_pm_data pm; > > > struct mptcp_sched_ops *sched; > > > + struct task_struct *bpf_iter_task; > > > struct { > > > u32 space; /* bytes copied in last > > > measurement > > > window */ > > > u32 copied; /* bytes copied in this > > > measurement > > > window */ > > > @@ -1291,4 +1292,23 @@ mptcp_token_join_cookie_init_state(struct > > > mptcp_subflow_request_sock *subflow_re > > > static inline void mptcp_join_cookie_init(void) {} > > > #endif > > > > > > +static inline void mptcp_set_bpf_iter_task(struct mptcp_sock > > > *msk) > > > +{ > > > + WRITE_ONCE(msk->bpf_iter_task, current); > > > +} > > > + > > > +static inline void mptcp_clear_bpf_iter_task(struct mptcp_sock > > > *msk) > > > +{ > > > + WRITE_ONCE(msk->bpf_iter_task, NULL); > > > +} > > > + > > > +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock > > > *msk) > > > +{ > > > + struct task_struct *task = READ_ONCE(msk- > > > >bpf_iter_task); > > > + > > > + if (task && task == current) > > > + return true; > > > + return false; > > > +} > > > > This v3 has a bug. When I was testing MPTCP BPF selftests in a > > loop, I > > found that the test would break in some cases. After debugging, I > > found > > that "task" and "current" were not equal: > > > > [ 520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new > > msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f > > current=0000000024db2987 > > > > I will try to fix it, but haven't found a solution yet. 
> > (sorry for the delay, I need a bit of time to catch up) > > I talked a bit to Alexei Starovoitov last week. He told me that with > the > BPF struct_ops, it is possible to tell the verifier that some locks > are > taken either by some struct_ops types, or even per callbacks of some > specific struct_ops (WIP on sched_ext side). It is also possible to > get > some locks automatically (polymorphism), and there are examples on > VFS side. Thanks for your reminder, I will look at these BPF codes. Our goal is to make mptcp_subflow bpf_iter only used by struct_ops defined by MPTCP BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other struct_ops are not allowed to use mptcp_subflow bpf_iter. > > In other words, it means we don't need to add this "bpf_iter_task", > there are other techniques, but I don't have more details, and I > didn't > check in the code. If it is not clear for you and you don't find > other > examples elsewhere (sched_ext? check WIP patches maybe?), then Alexei > said we should not hesitate to ask questions on the BPF mailing list. We can send "Add mptcp_subflow bpf_iter support" v3 to the BPF mailing list and ask questions during the reviewing process. Thanks, -Geliang > > Cheers, > Matt
Hi Geliang, On 17/03/2025 11:59, Geliang Tang wrote: > Hi Matt, > > On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote: >> Hi Geliang, Mat, >> >> On 17/03/2025 10:41, Geliang Tang wrote: >>> On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote: >>>> From: Geliang Tang <tanggeliang@kylinos.cn> >>>> >>>> To make sure the mptcp_subflow bpf_iter is running in the >>>> MPTCP context. This patch adds a simplified version of tracking >>>> for it: >>>> >>>> 1. Add a 'struct task_struct *bpf_iter_task' field to struct >>>> mptcp_sock. >>>> >>>> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling >>>> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after >>>> the hook returns. >>>> >>>> 3. In bpf_iter_mptcp_subflow_new(), check >>>> >>>> "READ_ONCE(msk->bpf_scheduler_task) == current" >>>> >>>> to confirm the correct task, return -EINVAL if it doesn't match. >>>> >>>> Also creates helpers for setting, clearing and checking that >>>> value. (...) >>>> +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock >>>> *msk) >>>> +{ >>>> + struct task_struct *task = READ_ONCE(msk- >>>>> bpf_iter_task); >>>> + >>>> + if (task && task == current) >>>> + return true; >>>> + return false; >>>> +} >>> >>> This v3 has a bug. When I was testing MPTCP BPF selftests in a >>> loop, I >>> found that the test would break in some cases. After debugging, I >>> found >>> that "task" and "current" were not equal: >>> >>> [ 520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new >>> msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f >>> current=0000000024db2987 >>> >>> I will try to fix it, but haven't found a solution yet. >> >> (sorry for the delay, I need a bit of time to catch up) >> >> I talked a bit to Alexei Starovoitov last week. He told me that with >> the >> BPF struct_ops, it is possible to tell the verifier that some locks >> are >> taken either by some struct_ops types, or even per callbacks of some >> specific struct_ops (WIP on sched_ext side). 
It is also possible to >> get >> some locks automatically (polymorphism), and there are examples on >> VFS side. > > Thanks for your reminder, I will look at these BPF codes. Our goal is > to make mptcp_subflow bpf_iter only used by struct_ops defined by MPTCP > BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other struct_ops > are not allowed to use mptcp_subflow bpf_iter. Yes, that's correct. I didn't check, but **maybe** some mptcp_sched_ops and mptcp_pm_ops callback might not be allowed to use bpf_iter. In this case, it might be needed to allow only some of them to use bpf_iter. Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be done while holding the msk lock, e.g. being called from the worker and not from a subflow event, etc. but maybe there are some exceptions needed. >> In other words, it means we don't need to add this "bpf_iter_task", >> there are other techniques, but I don't have more details, and I >> didn't >> check in the code. If it is not clear for you and you don't find >> other >> examples elsewhere (sched_ext? check WIP patches maybe?), then Alexei >> said we should not hesitate to ask questions on the BPF mailing list. > > We can send "Add mptcp_subflow bpf_iter support" v3 to the BPF mailing > list and ask questions during the reviewing process. Indeed. I hope to be able to find a bit of time this week to send the v3. Cheers, Matt
On Mon, 17 Mar 2025, Matthieu Baerts wrote: > Hi Geliang, > > On 17/03/2025 11:59, Geliang Tang wrote: >> Hi Matt, >> >> On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote: >>> Hi Geliang, Mat, >>> >>> On 17/03/2025 10:41, Geliang Tang wrote: >>>> On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote: >>>>> From: Geliang Tang <tanggeliang@kylinos.cn> >>>>> >>>>> To make sure the mptcp_subflow bpf_iter is running in the >>>>> MPTCP context. This patch adds a simplified version of tracking >>>>> for it: >>>>> >>>>> 1. Add a 'struct task_struct *bpf_iter_task' field to struct >>>>> mptcp_sock. >>>>> >>>>> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling >>>>> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after >>>>> the hook returns. >>>>> >>>>> 3. In bpf_iter_mptcp_subflow_new(), check >>>>> >>>>> "READ_ONCE(msk->bpf_scheduler_task) == current" >>>>> >>>>> to confirm the correct task, return -EINVAL if it doesn't match. >>>>> >>>>> Also creates helpers for setting, clearing and checking that >>>>> value. > > (...) > >>>>> +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock >>>>> *msk) >>>>> +{ >>>>> + struct task_struct *task = READ_ONCE(msk- >>>>>> bpf_iter_task); >>>>> + >>>>> + if (task && task == current) >>>>> + return true; >>>>> + return false; >>>>> +} >>>> >>>> This v3 has a bug. When I was testing MPTCP BPF selftests in a >>>> loop, I >>>> found that the test would break in some cases. After debugging, I >>>> found >>>> that "task" and "current" were not equal: >>>> >>>> [ 520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new >>>> msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f >>>> current=0000000024db2987 >>>> >>>> I will try to fix it, but haven't found a solution yet. >>> >>> (sorry for the delay, I need a bit of time to catch up) >>> >>> I talked a bit to Alexei Starovoitov last week. 
He told me that with >>> the >>> BPF struct_ops, it is possible to tell the verifier that some locks >>> are >>> taken either by some struct_ops types, or even per callbacks of some >>> specific struct_ops (WIP on sched_ext side). It is also possible to >>> get >>> some locks automatically (polymorphism), and there are examples on >>> VFS side. >> >> Thanks for your reminder, I will look at these BPF codes. Our goal is >> to make mptcp_subflow bpf_iter only used by struct_ops defined by MPTCP >> BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other struct_ops >> are not allowed to use mptcp_subflow bpf_iter. > > Yes, that's correct. I didn't check, but **maybe** some mptcp_sched_ops > and mptcp_pm_ops callback might not be allowed to use bpf_iter. In this > case, it might be needed to allow only some of them to use bpf_iter. > > Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be > done while holding the msk lock, e.g. being called from the worker and > not from a subflow event, etc. but maybe there are some exceptions needed. Hi Matthieu - Is this last paragraph also feedback from Alexei? Can you elaborate a little - is the lock needed for BPF-specific reasons or to avoid having the callback modify msk state? I'll try to look around at the verifier and automatic locking techniques you mentioned above, it would be great to have a better way to handle socket locks w/ BPF. - Mat > >>> In other words, it means we don't need to add this "bpf_iter_task", >>> there are other techniques, but I don't have more details, and I >>> didn't >>> check in the code. If it is not clear for you and you don't find >>> other >>> examples elsewhere (sched_ext? check WIP patches maybe?), then Alexei >>> said we should not hesitate to ask questions on the BPF mailing list. >> >> We can send "Add mptcp_subflow bpf_iter support" v3 to the BPF mailing >> list and ask questions during the reviewing process. > > Indeed. 
I hope to be able to find a bit of time this week to send the v3. > > Cheers, > Matt > -- > Sponsored by the NGI0 Core fund. > >
Hi Matt, On Mon, 2025-03-17 at 14:57 +0100, Matthieu Baerts wrote: > Hi Geliang, > > On 17/03/2025 11:59, Geliang Tang wrote: > > Hi Matt, > > > > On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote: > > > Hi Geliang, Mat, > > > > > > On 17/03/2025 10:41, Geliang Tang wrote: > > > > On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote: > > > > > From: Geliang Tang <tanggeliang@kylinos.cn> > > > > > > > > > > To make sure the mptcp_subflow bpf_iter is running in the > > > > > MPTCP context. This patch adds a simplified version of > > > > > tracking > > > > > for it: > > > > > > > > > > 1. Add a 'struct task_struct *bpf_iter_task' field to struct > > > > > mptcp_sock. > > > > > > > > > > 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before > > > > > calling > > > > > a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) > > > > > after > > > > > the hook returns. > > > > > > > > > > 3. In bpf_iter_mptcp_subflow_new(), check > > > > > > > > > > "READ_ONCE(msk->bpf_scheduler_task) == current" > > > > > > > > > > to confirm the correct task, return -EINVAL if it doesn't > > > > > match. > > > > > > > > > > Also creates helpers for setting, clearing and checking that > > > > > value. > > (...) > > > > > > +static inline bool mptcp_check_bpf_iter_task(struct > > > > > mptcp_sock > > > > > *msk) > > > > > +{ > > > > > + struct task_struct *task = READ_ONCE(msk- > > > > > > bpf_iter_task); > > > > > + > > > > > + if (task && task == current) > > > > > + return true; > > > > > + return false; > > > > > +} > > > > > > > > This v3 has a bug. When I was testing MPTCP BPF selftests in a > > > > loop, I > > > > found that the test would break in some cases. 
After debugging, > > > > I > > > > found > > > > that "task" and "current" were not equal: > > > > > > > > [ 520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new > > > > msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f > > > > current=0000000024db2987 > > > > > > > > I will try to fix it, but haven't found a solution yet. > > > > > > (sorry for the delay, I need a bit of time to catch up) > > > > > > I talked a bit to Alexei Starovoitov last week. He told me that > > > with > > > the > > > BPF struct_ops, it is possible to tell the verifier that some > > > locks > > > are > > > taken either by some struct_ops types, or even per callbacks of > > > some > > > specific struct_ops (WIP on sched_ext side). It is also possible > > > to > > > get > > > some locks automatically (polymorphism), and there are examples > > > on > > > VFS side. > > > > Thanks for your reminder, I will look at these BPF codes. Our goal > > is > > to make mptcp_subflow bpf_iter only used by struct_ops defined by > > MPTCP > > BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other > > struct_ops > > are not allowed to use mptcp_subflow bpf_iter. I checked and found that other struct_ops **cannot** use mptcp_subflow bpf_iter. Although we registered this bpf_iter for use with BPF_PROG_TYPE_STRUCT_OPS type, we checked in bpf_iter_mptcp_subflow_new that sk->sk_protocol must be IPPROTO_MPTCP. Does this mean that other struct_ops cannot use mptcp_subflow bpf_iter successfully? I don't know if this check is sufficient. > > Yes, that's correct. I didn't check, but **maybe** some > mptcp_sched_ops > and mptcp_pm_ops callback might not be allowed to use bpf_iter. In > this > case, it might be needed to allow only some of them to use bpf_iter. I guess it's not easy to allow only some callbacks of a struct_ops to access certain functions, because The BPF verifier verifies the struct_ops as a whole. But I will continue to look for a solution. 
> > Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be > done while holding the msk lock, e.g. being called from the worker > and > not from a subflow event, etc. but maybe there are some exceptions > needed. I checked the following 19 callbacks of mptcp_{pm,sched}_ops that have been implemented in my code tree. Except for the three exceptions (get_local_id, get_priority and add_addr_received), the other 17 callbacks are all be done while holding the msk lock: struct mptcp_sched_ops { int (*get_send)(struct mptcp_sock *msk); int (*get_retrans)(struct mptcp_sock *msk); } struct mptcp_pm_ops { /* required */ int (*get_local_id)(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *skc); bool (*get_priority)(struct mptcp_sock *msk, struct mptcp_addr_info *skc); /* optional */ void (*established)(struct mptcp_sock *msk); void (*subflow_established)(struct mptcp_sock *msk); /* required */ bool (*allow_new_subflow)(struct mptcp_sock *msk); bool (*accept_new_subflow)(const struct mptcp_sock *msk); bool (*add_addr_echo)(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); /* optional */ int (*add_addr_received)(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void (*rm_addr_received)(struct mptcp_sock *msk); /* optional */ int (*add_addr)(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry); int (*del_addr)(struct mptcp_sock *msk, const struct mptcp_pm_addr_entry *entry); int (*flush_addrs)(struct mptcp_sock *msk, struct list_head *rm_list); /* optional */ int (*address_announce)(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *local); int (*address_remove)(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *local); int (*subflow_create)(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *local, struct mptcp_addr_info *remote); int (*subflow_destroy)(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *local, struct mptcp_addr_info *remote); /* required */ int (*set_priority)(struct mptcp_sock *msk, struct mptcp_pm_addr_entry 
*local, struct mptcp_pm_addr_entry *remote); }; For these three exceptions, add_addr_received is invoked in mptcp_pm_add_addr_received, here the socket lock of ssk is already holding. Similarly, in subflow_chk_local_id, get_local_id and get_priority are called, where the socket lock of ssk is already holding too. In addition, get_local_id and get_priority are also called in subflow_token_join_request, which is in atomic and can hold msk lock through bh_lock_sock. If locking is required here, I can send a patch to do this. In this way, all callbacks of mptcp_{pm,sched}_ops can be done while holding the msk lock or the ssk lock. Thanks, -Geliang > > > > In other words, it means we don't need to add this > > > "bpf_iter_task", > > > there are other techniques, but I don't have more details, and I > > > didn't > > > check in the code. If it is not clear for you and you don't find > > > other > > > examples elsewhere (sched_ext? check WIP patches maybe?), then > > > Alexei > > > said we should not hesitate to ask questions on the BPF mailing > > > list. > > > > We can send "Add mptcp_subflow bpf_iter support" v3 to the BPF > > mailing > > list and ask questions during the reviewing process. > > Indeed. I hope to be able to find a bit of time this week to send the > v3. > > Cheers, > Matt
Hi Mat, On 18/03/2025 02:25, Mat Martineau wrote: > On Mon, 17 Mar 2025, Matthieu Baerts wrote: > >> Hi Geliang, >> >> On 17/03/2025 11:59, Geliang Tang wrote: >>> Hi Matt, >>> >>> On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote: >>>> Hi Geliang, Mat, >>>> >>>> On 17/03/2025 10:41, Geliang Tang wrote: >>>>> On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote: >>>>>> From: Geliang Tang <tanggeliang@kylinos.cn> >>>>>> >>>>>> To make sure the mptcp_subflow bpf_iter is running in the >>>>>> MPTCP context. This patch adds a simplified version of tracking >>>>>> for it: >>>>>> >>>>>> 1. Add a 'struct task_struct *bpf_iter_task' field to struct >>>>>> mptcp_sock. >>>>>> >>>>>> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling >>>>>> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after >>>>>> the hook returns. >>>>>> >>>>>> 3. In bpf_iter_mptcp_subflow_new(), check >>>>>> >>>>>> "READ_ONCE(msk->bpf_scheduler_task) == current" >>>>>> >>>>>> to confirm the correct task, return -EINVAL if it doesn't match. >>>>>> >>>>>> Also creates helpers for setting, clearing and checking that >>>>>> value. >> >> (...) >> >>>>>> +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock >>>>>> *msk) >>>>>> +{ >>>>>> + struct task_struct *task = READ_ONCE(msk- >>>>>>> bpf_iter_task); >>>>>> + >>>>>> + if (task && task == current) >>>>>> + return true; >>>>>> + return false; >>>>>> +} >>>>> >>>>> This v3 has a bug. When I was testing MPTCP BPF selftests in a >>>>> loop, I >>>>> found that the test would break in some cases. After debugging, I >>>>> found >>>>> that "task" and "current" were not equal: >>>>> >>>>> [ 520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new >>>>> msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f >>>>> current=0000000024db2987 >>>>> >>>>> I will try to fix it, but haven't found a solution yet. >>>> >>>> (sorry for the delay, I need a bit of time to catch up) >>>> >>>> I talked a bit to Alexei Starovoitov last week. 
He told me that with >>>> the >>>> BPF struct_ops, it is possible to tell the verifier that some locks >>>> are >>>> taken either by some struct_ops types, or even per callbacks of some >>>> specific struct_ops (WIP on sched_ext side). It is also possible to >>>> get >>>> some locks automatically (polymorphism), and there are examples on >>>> VFS side. >>> >>> Thanks for your reminder, I will look at these BPF codes. Our goal is >>> to make mptcp_subflow bpf_iter only used by struct_ops defined by MPTCP >>> BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other struct_ops >>> are not allowed to use mptcp_subflow bpf_iter. >> >> Yes, that's correct. I didn't check, but **maybe** some mptcp_sched_ops >> and mptcp_pm_ops callback might not be allowed to use bpf_iter. In this >> case, it might be needed to allow only some of them to use bpf_iter. >> >> Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be >> done while holding the msk lock, e.g. being called from the worker and >> not from a subflow event, etc. but maybe there are some exceptions >> needed. > > Hi Matthieu - > > Is this last paragraph also feedback from Alexei? Can you elaborate a > little - is the lock needed for BPF-specific reasons or to avoid having > the callback modify msk state? No, it is not from Alexei, but from me. Currently, some actions on a subflow will call some code in pm.c that will simply schedule the worker to be called from a different context to do other actions. To convert this to an mptcp_pm_ops, I think we should not have two callbacks -- one to schedule the worker, and one to react from the worker context -- but only one called from the worker context if such callback is set. This should also simplify the BPF PMs if all/most callbacks are done from the same context while owning the msk lock, no? 
> I'll try to look around at the verifier and automatic locking techniques > you mentioned above, it would be great to have a better way to handle > socket locks w/ BPF. Alexei suggested to look at the VFS code. If I understood correctly, the idea is to have some locks automatically taken for some actions, but without having to dedicate new kfuncs, checking if they are already taken, or requiring the BPF code to handle them (auto-releasing them then). But I don't have more details about that. Cheers, Matt
Hi Geliang, On 18/03/2025 11:35, Geliang Tang wrote: > Hi Matt, > > On Mon, 2025-03-17 at 14:57 +0100, Matthieu Baerts wrote: >> Hi Geliang, >> >> On 17/03/2025 11:59, Geliang Tang wrote: >>> Hi Matt, >>> >>> On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote: >>>> Hi Geliang, Mat, >>>> >>>> On 17/03/2025 10:41, Geliang Tang wrote: >>>>> On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote: >>>>>> From: Geliang Tang <tanggeliang@kylinos.cn> >>>>>> >>>>>> To make sure the mptcp_subflow bpf_iter is running in the >>>>>> MPTCP context. This patch adds a simplified version of >>>>>> tracking >>>>>> for it: >>>>>> >>>>>> 1. Add a 'struct task_struct *bpf_iter_task' field to struct >>>>>> mptcp_sock. >>>>>> >>>>>> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before >>>>>> calling >>>>>> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) >>>>>> after >>>>>> the hook returns. >>>>>> >>>>>> 3. In bpf_iter_mptcp_subflow_new(), check >>>>>> >>>>>> "READ_ONCE(msk->bpf_scheduler_task) == current" >>>>>> >>>>>> to confirm the correct task, return -EINVAL if it doesn't >>>>>> match. >>>>>> >>>>>> Also creates helpers for setting, clearing and checking that >>>>>> value. >> >> (...) >> >>>>>> +static inline bool mptcp_check_bpf_iter_task(struct >>>>>> mptcp_sock >>>>>> *msk) >>>>>> +{ >>>>>> + struct task_struct *task = READ_ONCE(msk- >>>>>>> bpf_iter_task); >>>>>> + >>>>>> + if (task && task == current) >>>>>> + return true; >>>>>> + return false; >>>>>> +} >>>>> >>>>> This v3 has a bug. When I was testing MPTCP BPF selftests in a >>>>> loop, I >>>>> found that the test would break in some cases. After debugging, >>>>> I >>>>> found >>>>> that "task" and "current" were not equal: >>>>> >>>>> [ 520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new >>>>> msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f >>>>> current=0000000024db2987 >>>>> >>>>> I will try to fix it, but haven't found a solution yet. 
>>>> >>>> (sorry for the delay, I need a bit of time to catch up) >>>> >>>> I talked a bit to Alexei Starovoitov last week. He told me that >>>> with >>>> the >>>> BPF struct_ops, it is possible to tell the verifier that some >>>> locks >>>> are >>>> taken either by some struct_ops types, or even per callbacks of >>>> some >>>> specific struct_ops (WIP on sched_ext side). It is also possible >>>> to >>>> get >>>> some locks automatically (polymorphism), and there are examples >>>> on >>>> VFS side. >>> >>> Thanks for your reminder, I will look at these BPF codes. Our goal >>> is >>> to make mptcp_subflow bpf_iter only used by struct_ops defined by >>> MPTCP >>> BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other >>> struct_ops >>> are not allowed to use mptcp_subflow bpf_iter. > > I checked and found that other struct_ops **cannot** use mptcp_subflow > bpf_iter. Although we registered this bpf_iter for use with > BPF_PROG_TYPE_STRUCT_OPS type, we checked in bpf_iter_mptcp_subflow_new > that sk->sk_protocol must be IPPROTO_MPTCP. Does this mean that other > struct_ops cannot use mptcp_subflow bpf_iter successfully? I don't know > if this check is sufficient. Sorry, I don't know. With the current version, I don't see any links between mptcp_subflow bpf_iter and mptcp_{sched,pm}_ops, then I don't know how the restriction works, right? I guess there might be a restriction because "struct mptcp_sock*" are used in arguments? But I guess that's not enough because such structures can be obtained from different struct_ops. Probably something else is missing to have this link then? >> Yes, that's correct. I didn't check, but **maybe** some >> mptcp_sched_ops >> and mptcp_pm_ops callback might not be allowed to use bpf_iter. In >> this >> case, it might be needed to allow only some of them to use bpf_iter. 
> > I guess it's not easy to allow only some callbacks of a struct_ops to > access certain functions, because The BPF verifier verifies the > struct_ops as a whole. But I will continue to look for a solution. Alexei told me that this work was in progress for sched_ext, but I don't know more about that, sorry. >> Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be >> done while holding the msk lock, e.g. being called from the worker >> and >> not from a subflow event, etc. but maybe there are some exceptions >> needed. > > I checked the following 19 callbacks of mptcp_{pm,sched}_ops that have > been implemented in my code tree. Except for the three exceptions > (get_local_id, get_priority and add_addr_received), the other 17 > callbacks are all be done while holding the msk lock: Thank you for having checked! > struct mptcp_sched_ops { > int (*get_send)(struct mptcp_sock *msk); > int (*get_retrans)(struct mptcp_sock *msk); > } > > struct mptcp_pm_ops { > /* required */ > int (*get_local_id)(struct mptcp_sock *msk, > struct mptcp_pm_addr_entry *skc); > bool (*get_priority)(struct mptcp_sock *msk, > struct mptcp_addr_info *skc); > > /* optional */ > void (*established)(struct mptcp_sock *msk); > void (*subflow_established)(struct mptcp_sock *msk); > > /* required */ > bool (*allow_new_subflow)(struct mptcp_sock *msk); > bool (*accept_new_subflow)(const struct mptcp_sock *msk); > bool (*add_addr_echo)(struct mptcp_sock *msk, > const struct mptcp_addr_info *addr); > > /* optional */ > int (*add_addr_received)(struct mptcp_sock *msk, > const struct mptcp_addr_info *addr); > void (*rm_addr_received)(struct mptcp_sock *msk); > > /* optional */ > int (*add_addr)(struct mptcp_sock *msk, > struct mptcp_pm_addr_entry *entry); > int (*del_addr)(struct mptcp_sock *msk, > const struct mptcp_pm_addr_entry *entry); > int (*flush_addrs)(struct mptcp_sock *msk, > struct list_head *rm_list); > > /* optional */ > int (*address_announce)(struct mptcp_sock *msk, > 
struct mptcp_pm_addr_entry *local); > int (*address_remove)(struct mptcp_sock *msk, > struct mptcp_pm_addr_entry *local); > int (*subflow_create)(struct mptcp_sock *msk, > struct mptcp_pm_addr_entry *local, > struct mptcp_addr_info *remote); > int (*subflow_destroy)(struct mptcp_sock *msk, > struct mptcp_pm_addr_entry *local, > struct mptcp_addr_info *remote); > > /* required */ > int (*set_priority)(struct mptcp_sock *msk, > struct mptcp_pm_addr_entry *local, > struct mptcp_pm_addr_entry *remote); Mmh, all 8 callbacks from add_addr to here seem to be linked to Netlink commands, right? If yes, they should not be here: they don't make sense for a BPF PM, no? BPF PMs should be configured with BPF, and not via Netlink. Netlink is only for the built-in PMs (in-kernel and userspace PMs) > }; > > For these three exceptions, add_addr_received is invoked in > mptcp_pm_add_addr_received, here the socket lock of ssk is already > holding. For add_addr_received, I think it is safer to have the callback from the worker context. In other words, when an ADD_ADDR received on a subflow: - the ADD_ADDR echo should be sent: I don't think it is worth it letting the other peer resending it just in case the userspace PM was not "ready" - if pm->ops->add_addr_received is set, schedule the worker and set msk->pm.remote - then pm->ops->add_addr_received will be called from the worker. > Similarly, in subflow_chk_local_id, get_local_id and get_priority are > called, where the socket lock of ssk is already holding too. > > In addition, get_local_id and get_priority are also called in > subflow_token_join_request, which is in atomic and can hold msk lock > through bh_lock_sock. If locking is required here, I can send a patch > to do this. Yes, for the ID and backup, I guess we will need an exception there, and a way not to let the BPF PMs calling bpf_iter. We should not hold the msk lock here. I guess the easier would be to ask the BPF maintainers what we should do here. 
Maybe this can be done after having sent the mptcp_subflow v3 series, or at the same time. I will see what I can do. > In this way, all callbacks of mptcp_{pm,sched}_ops can be done while > holding the msk lock or the ssk lock. At least on the scheduler side, that will be the case (or maybe not if we change the API to cover more cases :) ). Anyway, probably best to wait for BPF maintainers' recommendations instead of guessing. Cheers, Matt
On Tue, 18 Mar 2025, Matthieu Baerts wrote: > Hi Mat, > > On 18/03/2025 02:25, Mat Martineau wrote: >> On Mon, 17 Mar 2025, Matthieu Baerts wrote: >> >>> Hi Geliang, >>> >>> On 17/03/2025 11:59, Geliang Tang wrote: >>>> Hi Matt, >>>> >>>> On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote: >>>>> Hi Geliang, Mat, >>>>> >>>>> On 17/03/2025 10:41, Geliang Tang wrote: >>>>>> On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote: >>>>>>> From: Geliang Tang <tanggeliang@kylinos.cn> >>>>>>> >>>>>>> To make sure the mptcp_subflow bpf_iter is running in the >>>>>>> MPTCP context. This patch adds a simplified version of tracking >>>>>>> for it: >>>>>>> >>>>>>> 1. Add a 'struct task_struct *bpf_iter_task' field to struct >>>>>>> mptcp_sock. >>>>>>> >>>>>>> 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before calling >>>>>>> a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, NULL) after >>>>>>> the hook returns. >>>>>>> >>>>>>> 3. In bpf_iter_mptcp_subflow_new(), check >>>>>>> >>>>>>> "READ_ONCE(msk->bpf_scheduler_task) == current" >>>>>>> >>>>>>> to confirm the correct task, return -EINVAL if it doesn't match. >>>>>>> >>>>>>> Also creates helpers for setting, clearing and checking that >>>>>>> value. >>> >>> (...) >>> >>>>>>> +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock >>>>>>> *msk) >>>>>>> +{ >>>>>>> + struct task_struct *task = READ_ONCE(msk- >>>>>>>> bpf_iter_task); >>>>>>> + >>>>>>> + if (task && task == current) >>>>>>> + return true; >>>>>>> + return false; >>>>>>> +} >>>>>> >>>>>> This v3 has a bug. When I was testing MPTCP BPF selftests in a >>>>>> loop, I >>>>>> found that the test would break in some cases. After debugging, I >>>>>> found >>>>>> that "task" and "current" were not equal: >>>>>> >>>>>> [ 520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new >>>>>> msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f >>>>>> current=0000000024db2987 >>>>>> >>>>>> I will try to fix it, but haven't found a solution yet. 
>>>>> >>>>> (sorry for the delay, I need a bit of time to catch up) >>>>> >>>>> I talked a bit to Alexei Starovoitov last week. He told me that with >>>>> the >>>>> BPF struct_ops, it is possible to tell the verifier that some locks >>>>> are >>>>> taken either by some struct_ops types, or even per callbacks of some >>>>> specific struct_ops (WIP on sched_ext side). It is also possible to >>>>> get >>>>> some locks automatically (polymorphism), and there are examples on >>>>> VFS side. >>>> >>>> Thanks for your reminder, I will look at these BPF codes. Our goal is >>>> to make mptcp_subflow bpf_iter only used by struct_ops defined by MPTCP >>>> BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other struct_ops >>>> are not allowed to use mptcp_subflow bpf_iter. >>> >>> Yes, that's correct. I didn't check, but **maybe** some mptcp_sched_ops >>> and mptcp_pm_ops callback might not be allowed to use bpf_iter. In this >>> case, it might be needed to allow only some of them to use bpf_iter. >>> >>> Note that it sounds like all mptcp_{pm,sched}_ops callbacks should be >>> done while holding the msk lock, e.g. being called from the worker and >>> not from a subflow event, etc. but maybe there are some exceptions >>> needed. >> >> Hi Matthieu - >> >> Is this last paragraph also feedback from Alexei? Can you elaborate a >> little - is the lock needed for BPF-specific reasons or to avoid having >> the callback modify msk state? > > No, it is not from Alexei, but from me. Currently, some actions on a > subflow will call some code in pm.c that will simply schedule the worker > to be called from a different context to do other actions. To convert > this to an mptcp_pm_ops, I think we should not have two callbacks -- one > to schedule the worker, and one to react from the worker context -- but > only one called from the worker context if such callback is set. 
This > should also simplify the BPF PMs if all/most callbacks are done from the > same context while owning the msk lock, no? > Ok, thanks for the details. I agree, for the PM callbacks it makes sense to only have a worker-context callback. - Mat >> I'll try to look around at the verifier and automatic locking techniques >> you mentioned above, it would be great to have a better way to handle >> socket locks w/ BPF. > > Alexei suggested to look at the VFS code. If I understood correctly, the > idea is to have some locks automatically taken for some actions, but > without having to dedicate new kfunc, checking if they are already > taken, or requiring the BPF code to handle them (auto-releasing them > then). But I don't have more details about that. >
Hi Matt, On Tue, 2025-03-18 at 12:26 +0100, Matthieu Baerts wrote: > Hi Geliang, > > On 18/03/2025 11:35, Geliang Tang wrote: > > Hi Matt, > > > > On Mon, 2025-03-17 at 14:57 +0100, Matthieu Baerts wrote: > > > Hi Geliang, > > > > > > On 17/03/2025 11:59, Geliang Tang wrote: > > > > Hi Matt, > > > > > > > > On Mon, 2025-03-17 at 11:29 +0100, Matthieu Baerts wrote: > > > > > Hi Geliang, Mat, > > > > > > > > > > On 17/03/2025 10:41, Geliang Tang wrote: > > > > > > On Mon, 2025-03-10 at 11:30 +0800, Geliang Tang wrote: > > > > > > > From: Geliang Tang <tanggeliang@kylinos.cn> > > > > > > > > > > > > > > To make sure the mptcp_subflow bpf_iter is running in the > > > > > > > MPTCP context. This patch adds a simplified version of > > > > > > > tracking > > > > > > > for it: > > > > > > > > > > > > > > 1. Add a 'struct task_struct *bpf_iter_task' field to > > > > > > > struct > > > > > > > mptcp_sock. > > > > > > > > > > > > > > 2. Do a WRITE_ONCE(msk->bpf_iter_task, current) before > > > > > > > calling > > > > > > > a MPTCP BPF hook, and WRITE_ONCE(msk->bpf_iter_task, > > > > > > > NULL) > > > > > > > after > > > > > > > the hook returns. > > > > > > > > > > > > > > 3. In bpf_iter_mptcp_subflow_new(), check > > > > > > > > > > > > > > "READ_ONCE(msk->bpf_scheduler_task) == current" > > > > > > > > > > > > > > to confirm the correct task, return -EINVAL if it doesn't > > > > > > > match. > > > > > > > > > > > > > > Also creates helpers for setting, clearing and checking > > > > > > > that > > > > > > > value. > > > > > > (...) > > > > > > > > > > +static inline bool mptcp_check_bpf_iter_task(struct > > > > > > > mptcp_sock > > > > > > > *msk) > > > > > > > +{ > > > > > > > + struct task_struct *task = READ_ONCE(msk- > > > > > > > > bpf_iter_task); > > > > > > > + > > > > > > > + if (task && task == current) > > > > > > > + return true; > > > > > > > + return false; > > > > > > > +} > > > > > > > > > > > > This v3 has a bug. 
When I was testing MPTCP BPF selftests > > > > > > in a > > > > > > loop, I > > > > > > found that the test would break in some cases. After > > > > > > debugging, > > > > > > I > > > > > > found > > > > > > that "task" and "current" were not equal: > > > > > > > > > > > > [ 520.209749][T11984] MPTCP: bpf_iter_mptcp_subflow_new > > > > > > msk=00000000fc8f7370 in_interrupt=0 task=00000000ef28139f > > > > > > current=0000000024db2987 > > > > > > > > > > > > I will try to fix it, but haven't found a solution yet. > > > > > > > > > > (sorry for the delay, I need a bit of time to catch up) > > > > > > > > > > I talked a bit to Alexei Starovoitov last week. He told me > > > > > that > > > > > with > > > > > the > > > > > BPF struct_ops, it is possible to tell the verifier that some > > > > > locks > > > > > are > > > > > taken either by some struct_ops types, or even per callbacks > > > > > of > > > > > some > > > > > specific struct_ops (WIP on sched_ext side). It is also > > > > > possible > > > > > to > > > > > get > > > > > some locks automatically (polymorphism), and there are > > > > > examples > > > > > on > > > > > VFS side. > > > > > > > > Thanks for your reminder, I will look at these BPF codes. Our > > > > goal > > > > is > > > > to make mptcp_subflow bpf_iter only used by struct_ops defined > > > > by > > > > MPTCP > > > > BPF (bpf_mptcp_sched_ops and bpf_mptcp_pm_ops), right? Other > > > > struct_ops > > > > are not allowed to use mptcp_subflow bpf_iter. > > > > I checked and found that other struct_ops **cannot** use > > mptcp_subflow > > bpf_iter. Although we registered this bpf_iter for use with > > BPF_PROG_TYPE_STRUCT_OPS type, we checked in > > bpf_iter_mptcp_subflow_new > > that sk->sk_protocol must be IPPROTO_MPTCP. Does this mean that > > other > > struct_ops cannot use mptcp_subflow bpf_iter successfully? I don't > > know > > if this check is sufficient. > > Sorry, I don't know. 
With the current version, I don't see any links > between mptcp_subflow bpf_iter and mptcp_{sched,pm}_ops, then I don't > know how the restriction works, right? I guess there might be a > restriction because "struct mptcp_sock*" are used in arguments? But I > guess that's not enough because such structures can be obtained from > different struct_ops. Probably something else is missing to have this > link then? > > > > Yes, that's correct. I didn't check, but **maybe** some > > > mptcp_sched_ops > > > and mptcp_pm_ops callback might not be allowed to use bpf_iter. > > > In > > > this > > > case, it might be needed to allow only some of them to use > > > bpf_iter. > > > > I guess it's not easy to allow only some callbacks of a struct_ops > > to > > access certain functions, because The BPF verifier verifies the > > struct_ops as a whole. But I will continue to look for a solution. > > Alexei told me that this work was in progress for sched_ext, but I > don't > know more about that, sorry. > > > > Note that it sounds like all mptcp_{pm,sched}_ops callbacks > > > should be > > > done while holding the msk lock, e.g. being called from the > > > worker > > > and > > > not from a subflow event, etc. but maybe there are some > > > exceptions > > > needed. > > > > I checked the following 19 callbacks of mptcp_{pm,sched}_ops that > > have > > been implemented in my code tree. Except for the three exceptions > > (get_local_id, get_priority and add_addr_received), the other 17 > > callbacks are all be done while holding the msk lock: > > Thank you for having checked! 
> > > struct mptcp_sched_ops { > > int (*get_send)(struct mptcp_sock *msk); > > int (*get_retrans)(struct mptcp_sock *msk); > > } > > > > struct mptcp_pm_ops { > > /* required */ > > int (*get_local_id)(struct mptcp_sock *msk, > > struct mptcp_pm_addr_entry *skc); > > bool (*get_priority)(struct mptcp_sock *msk, > > struct mptcp_addr_info *skc); > > > > /* optional */ > > void (*established)(struct mptcp_sock *msk); > > void (*subflow_established)(struct mptcp_sock *msk); > > > > /* required */ > > bool (*allow_new_subflow)(struct mptcp_sock *msk); > > bool (*accept_new_subflow)(const struct mptcp_sock *msk); > > bool (*add_addr_echo)(struct mptcp_sock *msk, > > const struct mptcp_addr_info *addr); > > > > /* optional */ > > int (*add_addr_received)(struct mptcp_sock *msk, > > const struct mptcp_addr_info > > *addr); > > void (*rm_addr_received)(struct mptcp_sock *msk); > > > > /* optional */ > > int (*add_addr)(struct mptcp_sock *msk, > > struct mptcp_pm_addr_entry *entry); > > int (*del_addr)(struct mptcp_sock *msk, > > const struct mptcp_pm_addr_entry *entry); > > int (*flush_addrs)(struct mptcp_sock *msk, > > struct list_head *rm_list); > > > > /* optional */ > > int (*address_announce)(struct mptcp_sock *msk, > > struct mptcp_pm_addr_entry *local); > > int (*address_remove)(struct mptcp_sock *msk, > > struct mptcp_pm_addr_entry *local); > > int (*subflow_create)(struct mptcp_sock *msk, > > struct mptcp_pm_addr_entry *local, > > struct mptcp_addr_info *remote); > > int (*subflow_destroy)(struct mptcp_sock *msk, > > struct mptcp_pm_addr_entry *local, > > struct mptcp_addr_info *remote); > > > > /* required */ > > int (*set_priority)(struct mptcp_sock *msk, > > struct mptcp_pm_addr_entry *local, > > struct mptcp_pm_addr_entry *remote); > > Mmh, all 8 callbacks from add_addr to here seem to be linked to > Netlink > commands, right? If yes, they should not be here: they don't make > sense > for a BPF PM, no? 
BPF PMs should be configured with BPF, and not via Do you have any idea how we can pass commands like ADD_ADDR and addresses from user space to BPF? What I can think of is passing through sockopt, the user space passes the command and the address through setsockopt, and the BPF program handles it through the custom "cgroup/setsockopt". I would like to hear your opinions. If you also agree to use sockopt to implement it, I can write a test program for this. Thanks, -Geliang > Netlink. Netlink is only for the built-in PMs (in-kernel and > userspace PMs) > > > }; > > > > For these three exceptions, add_addr_received is invoked in > > mptcp_pm_add_addr_received, here the socket lock of ssk is already > > holding. > > For add_addr_received, I think it is safer to have the callback from > the > worker context. In other words, when an ADD_ADDR received on a > subflow: > > - the ADD_ADDR echo should be sent: I don't think it is worth it > letting > the other peer resending it just in case the userspace PM was not > "ready" > > - if pm->ops->add_addr_received is set, schedule the worker and set > msk->pm.remote > > - then pm->ops->add_addr_received will be called from the worker. > > > Similarly, in subflow_chk_local_id, get_local_id and get_priority > > are > > called, where the socket lock of ssk is already holding too. > > > > In addition, get_local_id and get_priority are also called in > > subflow_token_join_request, which is in atomic and can hold msk > > lock > > through bh_lock_sock. If locking is required here, I can send a > > patch > > to do this. > > Yes, for the ID and backup, I guess we will need an exception there, > and > a way not to let the BPF PMs calling bpf_iter. We should not hold the > msk lock here. > > I guess the easier would be to ask the BPF maintainers what we should > do > here. Maybe this can be done after having sent the mptcp_subflow v3 > series, or at the same time. I will see what I can do. 
> > > In this way, all callbacks of mptcp_{pm,sched}_ops can be done > > while > > holding the msk lock or the ssk lock. > > At least on the scheduler side, that will be the case (or maybe not > if > we change the API to cover more cases :) ). > > Anyway, probably best to wait for BPF maintainers recommendations > instead of guessing. > > Cheers, > Matt
Hi Geliang, On 21/03/2025 05:14, Geliang Tang wrote: > On Tue, 2025-03-18 at 12:26 +0100, Matthieu Baerts wrote: (...) >> Mmh, all 8 callbacks from add_addr to here seem to be linked to >> Netlink >> commands, right? If yes, they should not be here: they don't make >> sense >> for a BPF PM, no? BPF PMs should be configured with BPF, and not via > > Do you have any idea how we can pass commands like ADD_ADDR and > addresses from user space to BPF? The userspace can inject any kind of BPF program. Additional addresses can then be hardcoded in this program, or it can load them from eBPF maps, or read skel->bss->(...), etc. Netlink is then not needed. > What I can think of is passing through sockopt, the user space passes > the command and the address through setsockopt, and the BPF program > handles it through the custom "cgroup/setsockopt". I would like to hear > your opinions. If you also agree to use sockopt to implement it, I can > write a test program for this. That's a good idea for userspace apps which want to control how the MPTCP PM is acting. I guess MPTCP BPF PM will generally not be linked to network apps: a separate application dedicated to injecting and configuring the BPF PM, no? Cheers, Matt
Hi Matt, On Fri, 2025-03-21 at 10:19 +0100, Matthieu Baerts wrote: > Hi Geliang, > > On 21/03/2025 05:14, Geliang Tang wrote: > > On Tue, 2025-03-18 at 12:26 +0100, Matthieu Baerts wrote: > > (...) > > > > Mmh, all 8 callbacks from add_addr to here seem to be linked to > > > Netlink > > > commands, right? If yes, they should not be here: they don't make > > > sense > > > for a BPF PM, no? BPF PMs should be configured with BPF, and not > > > via > > > > Do you have any idea how we can pass commands like ADD_ADDR and > > addresses from user space to BPF? > > The userspace can inject any kind of BPF program. Additional > addresses > can then be hardcoded in this program, or it can load them from eBPF > maps, or read skel->bss->(...), etc. > > Netlink is then not needed. > > > What I can think of is passing through sockopt, the user space > > passes > > the command and the address through setsockopt, and the BPF program > > handles it through the custom "cgroup/setsockopt". I would like to > > hear > > your opinions. If you also agree to use sockopt to implement it, I > > can > > write a test program for this. I just tested it and found it difficult to use setsockopt. "cgroup/setsockopt" program doesn't work. We need to invoke sleepable kfuncs in BPF PM: BTF_ID_FLAGS(func, mptcp_pm_remove_addr_entry, KF_SLEEPABLE) BTF_ID_FLAGS(func, mptcp_pm_addr_send_ack, KF_SLEEPABLE) BTF_ID_FLAGS(func, mptcp_pm_mp_prio_send_ack, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_mptcp_subflow_connect, KF_SLEEPABLE) BTF_ID_FLAGS(func, mptcp_subflow_shutdown, KF_SLEEPABLE) BTF_ID_FLAGS(func, mptcp_close_ssk, KF_SLEEPABLE) They can't be invoked in "cgroup/setsockopt" program. 
And "cgroup/setsockopt" program can't be set with BPF_F_SLEEPABLE flag: err = err ?: bpf_program__set_flags(skel->progs.pm_setsockopt, BPF_F_SLEEPABLE); With this error occurs: test_bpf_hashmap_pm:PASS:set sleepable flags 0 nsec libbpf: prog 'pm_setsockopt': BPF program load failed: -EINVAL libbpf: prog 'pm_setsockopt': -- BEGIN PROG LOAD LOG -- Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 It seems that we can only choose one of fentry/fexit/fmod_ret, lsm, iter, uprobe, or struct_ops type to implement BPF PM. Thanks, -Geliang > That's a good idea for userspace apps which want to control how the > MPTCP PM is acting. I guess MPTCP BPF PM will generally not be linked > to > network apps: a separate application dedicated to inject and > configure > the BPF PM, no? > > Cheers, > Matt
On 21/03/2025 11:05, Geliang Tang wrote: > Hi Matt, > > On Fri, 2025-03-21 at 10:19 +0100, Matthieu Baerts wrote: >> Hi Geliang, >> >> On 21/03/2025 05:14, Geliang Tang wrote: >>> On Tue, 2025-03-18 at 12:26 +0100, Matthieu Baerts wrote: >> >> (...) >> >>>> Mmh, all 8 callbacks from add_addr to here seem to be linked to >>>> Netlink >>>> commands, right? If yes, they should not be here: they don't make >>>> sense >>>> for a BPF PM, no? BPF PMs should be configured with BPF, and not >>>> via >>> >>> Do you have any idea how we can pass commands like ADD_ADDR and >>> addresses from user space to BPF? >> >> The userspace can inject any kind of BPF program. Additional >> addresses >> can then be hardcoded in this program, or it can load them from eBPF >> maps, or read skel->bss->(...), etc. >> >> Netlink is then not needed. >> >>> What I can think of is passing through sockopt, the user space >>> passes >>> the command and the address through setsockopt, and the BPF program >>> handles it through the custom "cgroup/setsockopt". I would like to >>> hear >>> your opinions. If you also agree to use sockopt to implement it, I >>> can >>> write a test program for this. > > I just tested it and found it difficult to use setsockopt. > > "cgroup/setsockopt" program doesn't work. We need to invoke sleepable > kfuncs in BPF PM: (...) > It seems that we can only choose one of fentry/fexit/fmod_ret, lsm, > iter, uprobe, or struct_ops type to implement BPF PM. Thank you for having checked. I think we can look at that aspect later. I don't think many apps will want this kind of control. They can always set something in a map, and the BPF program can look at it when needed. Cheers, Matt
diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c index c0da9ac077e4..0a78604742c7 100644 --- a/net/mptcp/bpf.c +++ b/net/mptcp/bpf.c @@ -261,6 +261,8 @@ bpf_iter_mptcp_subflow_new(struct bpf_iter_mptcp_subflow *it, return -EINVAL; msk = mptcp_sk(sk); + if (!mptcp_check_bpf_iter_task(msk)) + return -EINVAL; msk_owned_by_me(msk); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 01157ad2e2dc..d98e48ce8cd8 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2729,6 +2729,7 @@ static void __mptcp_init_sock(struct sock *sk) inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); WRITE_ONCE(msk->allow_infinite_fallback, true); + mptcp_clear_bpf_iter_task(msk); msk->recovery = false; msk->subflow_id = 1; msk->last_data_sent = tcp_jiffies32; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 3492b256ecba..1c6958d64291 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -334,6 +334,7 @@ struct mptcp_sock { */ struct mptcp_pm_data pm; struct mptcp_sched_ops *sched; + struct task_struct *bpf_iter_task; struct { u32 space; /* bytes copied in last measurement window */ u32 copied; /* bytes copied in this measurement window */ @@ -1291,4 +1292,23 @@ mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_re static inline void mptcp_join_cookie_init(void) {} #endif +static inline void mptcp_set_bpf_iter_task(struct mptcp_sock *msk) +{ + WRITE_ONCE(msk->bpf_iter_task, current); +} + +static inline void mptcp_clear_bpf_iter_task(struct mptcp_sock *msk) +{ + WRITE_ONCE(msk->bpf_iter_task, NULL); +} + +static inline bool mptcp_check_bpf_iter_task(struct mptcp_sock *msk) +{ + struct task_struct *task = READ_ONCE(msk->bpf_iter_task); + + if (task && task == current) + return true; + return false; +} + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index f09f7eb1d63f..161398f8960c 100644 --- a/net/mptcp/sched.c +++ 
b/net/mptcp/sched.c @@ -155,6 +155,7 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; + int ret; msk_owned_by_me(msk); @@ -176,12 +177,16 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) if (msk->sched == &mptcp_sched_default || !msk->sched) return mptcp_sched_default_get_send(msk); - return msk->sched->get_send(msk); + mptcp_set_bpf_iter_task(msk); + ret = msk->sched->get_send(msk); + mptcp_clear_bpf_iter_task(msk); + return ret; } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; + int ret; msk_owned_by_me(msk); @@ -196,7 +201,9 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) if (msk->sched == &mptcp_sched_default || !msk->sched) return mptcp_sched_default_get_retrans(msk); - if (msk->sched->get_retrans) - return msk->sched->get_retrans(msk); - return msk->sched->get_send(msk); + mptcp_set_bpf_iter_task(msk); + ret = msk->sched->get_retrans ? msk->sched->get_retrans(msk) : + msk->sched->get_send(msk); + mptcp_clear_bpf_iter_task(msk); + return ret; }