Message ID | 20241025-mptcp-pm-lookup_addr_rcu-v2-2-1478f6c4b205@kernel.org (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Series | mptcp: pm: use _rcu variant under rcu_read_lock | expand |
Context | Check | Description |
---|---|---|
matttbe/build | success | Build and static analysis OK |
matttbe/checkpatch | success | total: 0 errors, 0 warnings, 0 checks, 97 lines checked |
matttbe/shellcheck | success | MPTCP selftests files have not been modified |
matttbe/KVM_Validation__normal | success | Success! ✅ |
matttbe/KVM_Validation__debug | success | Success! ✅ |
matttbe/KVM_Validation__btf-normal__only_bpftest_all_ | success | Success! ✅ |
matttbe/KVM_Validation__btf-debug__only_bpftest_all_ | success | Success! ✅ |
Hi Matt, Thanks for this patch. On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0) wrote: > In a few places -- to get an endpoint, dump all of them, and change > their flags -- the list is iterated while holding the pernet->lock, > but > only to read the content of the list. In these cases, we can replace > the > spin locks, by RCU read ones, and use the _rcu variants to iterate > over > the entries list in a lockless way. > > To make it clear, the lookup helpers using the _rcu variant are > renamed > with a _rcu suffix. The previous __lookup_addr() helper can then be > removed, but __lookup_addr_by_id() is still needed. > > While at it, the IDs bitmap is copied before iterating the list to > dump > the different addresses, to avoid any consistencies. > > Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> > --- > Notes: > - This is not a fix, a small improvement for -next. > --- > net/mptcp/pm_netlink.c | 36 +++++++++++++++++++----------------- > 1 file changed, 19 insertions(+), 17 deletions(-) > > diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c > index > a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8b50e > 6965731542871 100644 > --- a/net/mptcp/pm_netlink.c > +++ b/net/mptcp/pm_netlink.c > @@ -520,12 +520,12 @@ __lookup_addr_by_id(struct pm_nl_pernet > *pernet, unsigned int id) > } > > static struct mptcp_pm_addr_entry * > -__lookup_addr(struct pm_nl_pernet *pernet, const struct > mptcp_addr_info *info) > +__lookup_addr_by_id_rcu(struct pm_nl_pernet *pernet, unsigned int > id) > { > struct mptcp_pm_addr_entry *entry; > > - list_for_each_entry(entry, &pernet->local_addr_list, list) { > - if (mptcp_addresses_equal(&entry->addr, info, entry- > >addr.port)) > + list_for_each_entry_rcu(entry, &pernet->local_addr_list, > list) { > + if (entry->addr.id == id) > return entry; > } > return NULL; > @@ -1836,8 +1836,8 @@ int mptcp_pm_nl_get_addr(struct sk_buff *skb, > struct genl_info *info) > goto fail; > } > > - spin_lock_bh(&pernet->lock); > - entry = __lookup_addr_by_id(pernet, addr.addr.id); > + rcu_read_lock(); > + entry = __lookup_addr_by_id_rcu(pernet, addr.addr.id); > if (!entry) { > GENL_SET_ERR_MSG(info, "address not found"); > ret = -EINVAL; > @@ -1850,11 +1850,11 @@ int mptcp_pm_nl_get_addr(struct sk_buff *skb, > struct genl_info *info) > > genlmsg_end(msg, reply); > ret = genlmsg_reply(msg, info); > - spin_unlock_bh(&pernet->lock); > + rcu_read_unlock(); > return ret; > > unlock_fail: > - spin_unlock_bh(&pernet->lock); > + rcu_read_unlock(); > > fail: > nlmsg_free(msg); > @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff > *msg, > struct net *net = sock_net(msg->sk); > struct mptcp_pm_addr_entry *entry; > struct pm_nl_pernet *pernet; > + unsigned long id_bitmap[4]; > int id = cb->args[0]; > void *hdr; > int i; > > pernet = pm_nl_get_pernet(net); > + bitmap_copy(id_bitmap, pernet->id_bitmap, > MPTCP_PM_MAX_ADDR_ID + 1); I think the id bitmap should only be copied when id is 0: if (!id) bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); Since this mptcp_pm_nl_dump_addr() may be called repeatedly when the dump information is very long. We only copy it the first time it is called, and subsequent calls cannot copy it again. WDYT? Thanks, -Geliang > > - spin_lock_bh(&pernet->lock); > + rcu_read_lock(); > for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { > - if (test_bit(i, pernet->id_bitmap)) { > - entry = __lookup_addr_by_id(pernet, i); > + if (test_bit(i, id_bitmap)) { > + entry = __lookup_addr_by_id_rcu(pernet, i); > if (!entry) > break; > > @@ -1903,7 +1905,7 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, > genlmsg_end(msg, hdr); > } > } > - spin_unlock_bh(&pernet->lock); > + rcu_read_unlock(); > > cb->args[0] = id; > return msg->len; > @@ -2060,17 +2062,17 @@ int mptcp_pm_nl_set_flags(struct sk_buff > *skb, struct genl_info *info) > if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) > bkup = 1; > > - spin_lock_bh(&pernet->lock); > - entry = lookup_by_id ? __lookup_addr_by_id(pernet, > addr.addr.id) : > - __lookup_addr(pernet, &addr.addr); > + rcu_read_lock(); > + entry = lookup_by_id ? __lookup_addr_by_id_rcu(pernet, > addr.addr.id) : > + __lookup_addr_rcu(pernet, > &addr.addr); > if (!entry) { > - spin_unlock_bh(&pernet->lock); > + rcu_read_unlock(); > GENL_SET_ERR_MSG(info, "address not found"); > return -EINVAL; > } > if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && > (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { > - spin_unlock_bh(&pernet->lock); > + rcu_read_unlock(); > GENL_SET_ERR_MSG(info, "invalid addr flags"); > return -EINVAL; > } > @@ -2078,7 +2080,7 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, > struct genl_info *info) > changed = (addr.flags ^ entry->flags) & mask; > entry->flags = (entry->flags & ~mask) | (addr.flags & mask); > addr = *entry; > - spin_unlock_bh(&pernet->lock); > + rcu_read_unlock(); > > mptcp_nl_set_flags(net, &addr.addr, bkup, changed); > return 0; >
On 25/10/2024 11:32, Matthieu Baerts (NGI0) wrote: > In a few places -- to get an endpoint, dump all of them, and change > their flags -- the list is iterated while holding the pernet->lock, but > only to read the content of the list. In these cases, we can replace the > spin locks, by RCU read ones, and use the _rcu variants to iterate over > the entries list in a lockless way. > > To make it clear, the lookup helpers using the _rcu variant are renamed > with a _rcu suffix. The previous __lookup_addr() helper can then be > removed, but __lookup_addr_by_id() is still needed. > > While at it, the IDs bitmap is copied before iterating the list to dump > the different addresses, to avoid any consistencies. > > Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> > --- > Notes: > - This is not a fix, a small improvement for -next. > --- > net/mptcp/pm_netlink.c | 36 +++++++++++++++++++----------------- > 1 file changed, 19 insertions(+), 17 deletions(-) > > diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c > index a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8b50e6965731542871 100644 > --- a/net/mptcp/pm_netlink.c > +++ b/net/mptcp/pm_netlink.c > @@ -520,12 +520,12 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) > } > > static struct mptcp_pm_addr_entry * > -__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) > +__lookup_addr_by_id_rcu(struct pm_nl_pernet *pernet, unsigned int id) > { > struct mptcp_pm_addr_entry *entry; > > - list_for_each_entry(entry, &pernet->local_addr_list, list) { > - if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) > + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { I didn't see the following in the RCU doc -- but it is quite big, so I probably missed it -- but I suppose we don't need to keep the two helpers here with and without the _rcu variant (__lookup_addr_by_id()) if here we add an extra lockdep condition: list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, lockdep_is_held(&pernet->lock)) { WDYT? > + if (entry->addr.id == id) > return entry; > } > return NULL; (...) > @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, > struct net *net = sock_net(msg->sk); > struct mptcp_pm_addr_entry *entry; > struct pm_nl_pernet *pernet; > + unsigned long id_bitmap[4]; Oops, I left my temp declaration, it should be the following line, which will result in the same thing anyway on x86_64, no need to re-run the tests → I can fix that when applying the patches, or in a future version: DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); Cheers, Matt
Hi Geliang, Thank you for the review! On 25/10/2024 16:25, Geliang Tang wrote: > On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0) wrote: >> In a few places -- to get an endpoint, dump all of them, and change >> their flags -- the list is iterated while holding the pernet->lock, >> but >> only to read the content of the list. In these cases, we can replace >> the >> spin locks, by RCU read ones, and use the _rcu variants to iterate >> over >> the entries list in a lockless way. >> >> To make it clear, the lookup helpers using the _rcu variant are >> renamed >> with a _rcu suffix. The previous __lookup_addr() helper can then be >> removed, but __lookup_addr_by_id() is still needed. >> >> While at it, the IDs bitmap is copied before iterating the list to >> dump >> the different addresses, to avoid any consistencies. >> >> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> >> --- >> Notes: >> - This is not a fix, a small improvement for -next. >> --- >> net/mptcp/pm_netlink.c | 36 +++++++++++++++++++----------------- >> 1 file changed, 19 insertions(+), 17 deletions(-) >> >> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c >> index >> a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8b50e >> 6965731542871 100644 >> --- a/net/mptcp/pm_netlink.c >> +++ b/net/mptcp/pm_netlink.c (...) >> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff >> *msg, >> struct net *net = sock_net(msg->sk); >> struct mptcp_pm_addr_entry *entry; >> struct pm_nl_pernet *pernet; >> + unsigned long id_bitmap[4]; >> int id = cb->args[0]; >> void *hdr; >> int i; >> >> pernet = pm_nl_get_pernet(net); >> + bitmap_copy(id_bitmap, pernet->id_bitmap, >> MPTCP_PM_MAX_ADDR_ID + 1); > > I think the id bitmap should only be copied when id is 0: > > if (!id) > bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); > > Since this mptcp_pm_nl_dump_addr() may be called repeatedly when the > dump information is very long. We only copy it the first time it is > called, and subsequent calls cannot copy it again. WDYT? Sorry, I'm not sure to understand. Here, I did a local copy of 'id_bitmap' just to keep a certain consistency if the bitmap is changed during the RCU read section below: not to have this bitmap being changed during the for loop here below. Before, we had this protection because pernet->lock was held. Are you suggesting here to keep a copy in the cb, not a local one, to do the copy only once? Are you sure it is worth it (and safe)? >> >> - spin_lock_bh(&pernet->lock); >> + rcu_read_lock(); >> for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { >> - if (test_bit(i, pernet->id_bitmap)) { >> - entry = __lookup_addr_by_id(pernet, i); >> + if (test_bit(i, id_bitmap)) { >> + entry = __lookup_addr_by_id_rcu(pernet, i); >> if (!entry) >> break; >> Cheers, Matt
Hi Matt, On Fri, 2024-10-25 at 17:26 +0200, Matthieu Baerts wrote: > Hi Geliang, > > Thank you for the review! > > On 25/10/2024 16:25, Geliang Tang wrote: > > On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0) wrote: > > > In a few places -- to get an endpoint, dump all of them, and > > > change > > > their flags -- the list is iterated while holding the pernet- > > > >lock, > > > but > > > only to read the content of the list. In these cases, we can > > > replace > > > the > > > spin locks, by RCU read ones, and use the _rcu variants to > > > iterate > > > over > > > the entries list in a lockless way. > > > > > > To make it clear, the lookup helpers using the _rcu variant are > > > renamed > > > with a _rcu suffix. The previous __lookup_addr() helper can then > > > be > > > removed, but __lookup_addr_by_id() is still needed. > > > > > > While at it, the IDs bitmap is copied before iterating the list > > > to > > > dump > > > the different addresses, to avoid any consistencies. > > > > > > Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> > > > --- > > > Notes: > > > - This is not a fix, a small improvement for -next. > > > --- > > > net/mptcp/pm_netlink.c | 36 +++++++++++++++++++----------------- > > > 1 file changed, 19 insertions(+), 17 deletions(-) > > > > > > diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c > > > index > > > a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8 > > > b50e > > > 6965731542871 100644 > > > --- a/net/mptcp/pm_netlink.c > > > +++ b/net/mptcp/pm_netlink.c > > (...) > > > > @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff > > > *msg, > > > struct net *net = sock_net(msg->sk); > > > struct mptcp_pm_addr_entry *entry; > > > struct pm_nl_pernet *pernet; > > > + unsigned long id_bitmap[4]; > > > int id = cb->args[0]; > > > void *hdr; > > > int i; > > > > > > pernet = pm_nl_get_pernet(net); > > > + bitmap_copy(id_bitmap, pernet->id_bitmap, > > > MPTCP_PM_MAX_ADDR_ID + 1); > > > > I think the id bitmap should only be copied when id is 0: > > > > if (!id) > > bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + > > 1); > > > > Since this mptcp_pm_nl_dump_addr() may be called repeatedly when > > the > > dump information is very long. We only copy it the first time it is A correction is needed here. Regardless of whether the dump information is large or small, the dumpit function will be called repeatedly when dumpit returns non-zero. The loop stops when dumpit returns 0. > > called, and subsequent calls cannot copy it again. WDYT? > > Sorry, I'm not sure to understand. Here, I did a local copy of > 'id_bitmap' just to keep a certain consistency if the bitmap is > changed > during the RCU read section below: not to have this bitmap being > changed > during the for loop here below. Before, we had this protection > because > pernet->lock was held. It is precisely because we need to maintain this consistency that we need to copy the bitmap only once. If we copy the bitmap when dumpit is called a second time, the bitmap obtained at this time is a new bitmap, which destroys this consistency, right? > > Are you suggesting here to keep a copy in the cb, not a local one, to > do > the copy only once? Are you sure it is worth it (and safe)? Because dumpit will be called repeatedly, we need to ensure that the same bitmap is accessed each time, so we cannot use a local one, but need to keep a copy in the cb just like what we do in mptcp_userspace_pm_dump_addr. In addition, when doing bitmap_copy, we need to hold pernet->lock: bitmap = (struct mptcp_id_bitmap *)cb->ctx; if (!id) { spin_lock_bh(&pernet->lock); bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_unlock_bh(&pernet->lock); } Also, we can put this bitmap copy code into a separate patch, which is applied before this one. In this patch we only need to replace spin_lock_bh with rcu_read_lock. WDYT? Thanks, -Geliang > > > > > > > - spin_lock_bh(&pernet->lock); > > > + rcu_read_lock(); > > > for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { > > > - if (test_bit(i, pernet->id_bitmap)) { > > > - entry = __lookup_addr_by_id(pernet, i); > > > + if (test_bit(i, id_bitmap)) { > > > + entry = __lookup_addr_by_id_rcu(pernet, > > > i); > > > if (!entry) > > > break; > > > > Cheers, > Matt
Hi Geliang, On 28/10/2024 03:08, Geliang Tang wrote: > Hi Matt, > > On Fri, 2024-10-25 at 17:26 +0200, Matthieu Baerts wrote: >> Hi Geliang, >> >> Thank you for the review! >> >> On 25/10/2024 16:25, Geliang Tang wrote: >>> On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0) wrote: >>>> In a few places -- to get an endpoint, dump all of them, and >>>> change >>>> their flags -- the list is iterated while holding the pernet- >>>>> lock, >>>> but >>>> only to read the content of the list. In these cases, we can >>>> replace >>>> the >>>> spin locks, by RCU read ones, and use the _rcu variants to >>>> iterate >>>> over >>>> the entries list in a lockless way. >>>> >>>> To make it clear, the lookup helpers using the _rcu variant are >>>> renamed >>>> with a _rcu suffix. The previous __lookup_addr() helper can then >>>> be >>>> removed, but __lookup_addr_by_id() is still needed. >>>> >>>> While at it, the IDs bitmap is copied before iterating the list >>>> to >>>> dump >>>> the different addresses, to avoid any consistencies. >>>> >>>> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> >>>> --- >>>> Notes: >>>> - This is not a fix, a small improvement for -next. >>>> --- >>>> net/mptcp/pm_netlink.c | 36 +++++++++++++++++++----------------- >>>> 1 file changed, 19 insertions(+), 17 deletions(-) >>>> >>>> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c >>>> index >>>> a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8 >>>> b50e >>>> 6965731542871 100644 >>>> --- a/net/mptcp/pm_netlink.c >>>> +++ b/net/mptcp/pm_netlink.c >> >> (...) >> >>>> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff >>>> *msg, >>>> struct net *net = sock_net(msg->sk); >>>> struct mptcp_pm_addr_entry *entry; >>>> struct pm_nl_pernet *pernet; >>>> + unsigned long id_bitmap[4]; >>>> int id = cb->args[0]; >>>> void *hdr; >>>> int i; >>>> >>>> pernet = pm_nl_get_pernet(net); >>>> + bitmap_copy(id_bitmap, pernet->id_bitmap, >>>> MPTCP_PM_MAX_ADDR_ID + 1); >>> >>> I think the id bitmap should only be copied when id is 0: >>> >>> if (!id) >>> bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + >>> 1); >>> >>> Since this mptcp_pm_nl_dump_addr() may be called repeatedly when >>> the >>> dump information is very long. We only copy it the first time it is > > A correction is needed here. Regardless of whether the dump information > is large or small, the dumpit function will be called repeatedly when > dumpit returns non-zero. The loop stops when dumpit returns 0. > >>> called, and subsequent calls cannot copy it again. WDYT? >> >> Sorry, I'm not sure to understand. Here, I did a local copy of >> 'id_bitmap' just to keep a certain consistency if the bitmap is >> changed >> during the RCU read section below: not to have this bitmap being >> changed >> during the for loop here below. Before, we had this protection >> because >> pernet->lock was held. > > It is precisely because we need to maintain this consistency that we > need to copy the bitmap only once. If we copy the bitmap when dumpit is > called a second time, the bitmap obtained at this time is a new bitmap, > which destroys this consistency, right? OK, but then I have a few questions: - Was the code before my patch here prevented such consistency issues? Or said differently: is there a regression introduced by this patch? - Is it really an issue that can lead to a crash or reading freed info? - Where would you store the copy of the bitmap? In cb-args? - If we store this copy somewhere, could you not have situations where the bitmap would no longer be in sync with the addresses that are stored in 'pernet->local_addr_list', and then displaying wrong/freed info? It looks like more would be needed to cope with that. And maybe that's not worth it? >> Are you suggesting here to keep a copy in the cb, not a local one, to >> do >> the copy only once? Are you sure it is worth it (and safe)? > > Because dumpit will be called repeatedly, we need to ensure that the > same bitmap is accessed each time, so we cannot use a local one, but > need to keep a copy in the cb just like what we do in > mptcp_userspace_pm_dump_addr. > > > In addition, when doing bitmap_copy, we need to hold pernet->lock: > > bitmap = (struct mptcp_id_bitmap *)cb->ctx; > > if (!id) { > spin_lock_bh(&pernet->lock); > bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); > spin_unlock_bh(&pernet->lock); > } Ah yes, maybe, I need to check: I thought the modification of the bitmap was done atomically, and it was fine to read without the lock. But maybe not because it is stored in multiple bytes. > Also, we can put this bitmap copy code into a separate patch, which is > applied before this one. In this patch we only need to replace > spin_lock_bh with rcu_read_lock. OK so we are talking about separate fix that is not directly linked to my change, apart from the fact I modified code around, right? Cheers, Matt
Hi Matt, On Mon, 2024-10-28 at 12:48 +0100, Matthieu Baerts wrote: > Hi Geliang, > > On 28/10/2024 03:08, Geliang Tang wrote: > > Hi Matt, > > > > On Fri, 2024-10-25 at 17:26 +0200, Matthieu Baerts wrote: > > > Hi Geliang, > > > > > > Thank you for the review! > > > > > > On 25/10/2024 16:25, Geliang Tang wrote: > > > > On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0) > > > > wrote: > > > > > In a few places -- to get an endpoint, dump all of them, and > > > > > change > > > > > their flags -- the list is iterated while holding the pernet- > > > > > > lock, > > > > > but > > > > > only to read the content of the list. In these cases, we can > > > > > replace > > > > > the > > > > > spin locks, by RCU read ones, and use the _rcu variants to > > > > > iterate > > > > > over > > > > > the entries list in a lockless way. > > > > > > > > > > To make it clear, the lookup helpers using the _rcu variant > > > > > are > > > > > renamed > > > > > with a _rcu suffix. The previous __lookup_addr() helper can > > > > > then > > > > > be > > > > > removed, but __lookup_addr_by_id() is still needed. > > > > > > > > > > While at it, the IDs bitmap is copied before iterating the > > > > > list > > > > > to > > > > > dump > > > > > the different addresses, to avoid any consistencies. > > > > > > > > > > Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> > > > > > --- > > > > > Notes: > > > > > - This is not a fix, a small improvement for -next. > > > > > --- > > > > > net/mptcp/pm_netlink.c | 36 +++++++++++++++++++------------- > > > > > ---- > > > > > 1 file changed, 19 insertions(+), 17 deletions(-) > > > > > > > > > > diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c > > > > > index > > > > > a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b1 > > > > > 79a8 > > > > > b50e > > > > > 6965731542871 100644 > > > > > --- a/net/mptcp/pm_netlink.c > > > > > +++ b/net/mptcp/pm_netlink.c > > > > > > (...) > > > > > > > > @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct > > > > > sk_buff > > > > > *msg, > > > > > struct net *net = sock_net(msg->sk); > > > > > struct mptcp_pm_addr_entry *entry; > > > > > struct pm_nl_pernet *pernet; > > > > > + unsigned long id_bitmap[4]; > > > > > int id = cb->args[0]; > > > > > void *hdr; > > > > > int i; > > > > > > > > > > pernet = pm_nl_get_pernet(net); > > > > > + bitmap_copy(id_bitmap, pernet->id_bitmap, > > > > > MPTCP_PM_MAX_ADDR_ID + 1); > > > > > > > > I think the id bitmap should only be copied when id is 0: > > > > > > > > if (!id) > > > > bitmap_copy(id_bitmap, pernet->id_bitmap, > > > > MPTCP_PM_MAX_ADDR_ID + > > > > 1); > > > > > > > > Since this mptcp_pm_nl_dump_addr() may be called repeatedly > > > > when > > > > the > > > > dump information is very long. We only copy it the first time > > > > it is > > > > A correction is needed here. Regardless of whether the dump > > information > > is large or small, the dumpit function will be called repeatedly > > when > > dumpit returns non-zero. The loop stops when dumpit returns 0. > > > > > > called, and subsequent calls cannot copy it again. WDYT? > > > > > > Sorry, I'm not sure to understand. Here, I did a local copy of > > > 'id_bitmap' just to keep a certain consistency if the bitmap is > > > changed > > > during the RCU read section below: not to have this bitmap being > > > changed > > > during the for loop here below. Before, we had this protection > > > because > > > pernet->lock was held. > > > > It is precisely because we need to maintain this consistency that > > we > > need to copy the bitmap only once. If we copy the bitmap when > > dumpit is > > called a second time, the bitmap obtained at this time is a new > > bitmap, > > which destroys this consistency, right? > > OK, but then I have a few questions: > > - Was the code before my patch here prevented such consistency > issues? > Or said differently: is there a regression introduced by this patch? > > - Is it really an issue that can lead to a crash or reading freed > info? > > - Where would you store the copy of the bitmap? In cb-args? > > - If we store this copy somewhere, could you not have situations > where > the bitmap would no longer be in sync with the addresses that are > stored > in 'pernet->local_addr_list', and then displaying wrong/freed info? > It > looks like more would be needed to cope with that. And maybe that's > not > worth it? Your doubts are very reasonable. It seems that I have not considered it carefully enough. So we do need to define a local bitmap and copy it every time. It's fine to me and it's OK to implement BPF path manager based on it. Thanks, -Geliang > > > > Are you suggesting here to keep a copy in the cb, not a local > > > one, to > > > do > > > the copy only once? Are you sure it is worth it (and safe)? > > > > Because dumpit will be called repeatedly, we need to ensure that > > the > > same bitmap is accessed each time, so we cannot use a local one, > > but > > need to keep a copy in the cb just like what we do in > > mptcp_userspace_pm_dump_addr. > > > > > > In addition, when doing bitmap_copy, we need to hold pernet->lock: > > > > bitmap = (struct mptcp_id_bitmap *)cb->ctx; > > > > if (!id) { > > spin_lock_bh(&pernet->lock); > > bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + > > 1); > > spin_unlock_bh(&pernet->lock); > > } > > Ah yes, maybe, I need to check: I thought the modification of the > bitmap > was done atomically, and it was fine to read without the lock. But > maybe > not because it is stored in multiple bytes. > > > Also, we can put this bitmap copy code into a separate patch, which > > is > > applied before this one. In this patch we only need to replace > > spin_lock_bh with rcu_read_lock. > > OK so we are talking about separate fix that is not directly linked > to > my change, apart from the fact I modified code around, right? > > Cheers, > Matt
On Fri, 25 Oct 2024, Matthieu Baerts wrote: > On 25/10/2024 11:32, Matthieu Baerts (NGI0) wrote: >> In a few places -- to get an endpoint, dump all of them, and change >> their flags -- the list is iterated while holding the pernet->lock, but >> only to read the content of the list. In these cases, we can replace the >> spin locks, by RCU read ones, and use the _rcu variants to iterate over >> the entries list in a lockless way. >> >> To make it clear, the lookup helpers using the _rcu variant are renamed >> with a _rcu suffix. The previous __lookup_addr() helper can then be >> removed, but __lookup_addr_by_id() is still needed. >> >> While at it, the IDs bitmap is copied before iterating the list to dump >> the different addresses, to avoid any consistencies. >> >> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> >> --- >> Notes: >> - This is not a fix, a small improvement for -next. >> --- >> net/mptcp/pm_netlink.c | 36 +++++++++++++++++++----------------- >> 1 file changed, 19 insertions(+), 17 deletions(-) >> >> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c >> index a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8b50e6965731542871 100644 >> --- a/net/mptcp/pm_netlink.c >> +++ b/net/mptcp/pm_netlink.c >> @@ -520,12 +520,12 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) >> } >> >> static struct mptcp_pm_addr_entry * >> -__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) >> +__lookup_addr_by_id_rcu(struct pm_nl_pernet *pernet, unsigned int id) >> { >> struct mptcp_pm_addr_entry *entry; >> >> - list_for_each_entry(entry, &pernet->local_addr_list, list) { >> - if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) >> + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { > > I didn't see the following in the RCU doc -- but it is quite big, so I > probably missed it -- but I suppose we don't need to keep the two > helpers here with and without the _rcu variant (__lookup_addr_by_id()) > if here we add an extra lockdep condition: > > list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, > lockdep_is_held(&pernet->lock)) { > > WDYT? I like this idea, would clean things up. > >> + if (entry->addr.id == id) >> return entry; >> } >> return NULL; > > (...) > >> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, >> struct net *net = sock_net(msg->sk); >> struct mptcp_pm_addr_entry *entry; >> struct pm_nl_pernet *pernet; >> + unsigned long id_bitmap[4]; > > Oops, I left my temp declaration, it should be the following line, which > will result in the same thing anyway on x86_64, no need to re-run the > tests → I can fix that when applying the patches, or in a future version: > > DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); > Thanks for catching that. I think it would be best to have a v3 with the lockdep_is_held() approach you mentioned above. - Mat
On Tue, 29 Oct 2024, Geliang Tang wrote: > Hi Matt, > > On Mon, 2024-10-28 at 12:48 +0100, Matthieu Baerts wrote: >> Hi Geliang, >> >> On 28/10/2024 03:08, Geliang Tang wrote: >>> Hi Matt, >>> >>> On Fri, 2024-10-25 at 17:26 +0200, Matthieu Baerts wrote: >>>> Hi Geliang, >>>> >>>> Thank you for the review! >>>> >>>> On 25/10/2024 16:25, Geliang Tang wrote: >>>>> On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0) >>>>> wrote: >>>>>> In a few places -- to get an endpoint, dump all of them, and >>>>>> change >>>>>> their flags -- the list is iterated while holding the pernet- >>>>>>> lock, >>>>>> but >>>>>> only to read the content of the list. In these cases, we can >>>>>> replace >>>>>> the >>>>>> spin locks, by RCU read ones, and use the _rcu variants to >>>>>> iterate >>>>>> over >>>>>> the entries list in a lockless way. >>>>>> >>>>>> To make it clear, the lookup helpers using the _rcu variant >>>>>> are >>>>>> renamed >>>>>> with a _rcu suffix. The previous __lookup_addr() helper can >>>>>> then >>>>>> be >>>>>> removed, but __lookup_addr_by_id() is still needed. >>>>>> >>>>>> While at it, the IDs bitmap is copied before iterating the >>>>>> list >>>>>> to >>>>>> dump >>>>>> the different addresses, to avoid any consistencies. >>>>>> >>>>>> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> >>>>>> --- >>>>>> Notes: >>>>>> - This is not a fix, a small improvement for -next. >>>>>> --- >>>>>> net/mptcp/pm_netlink.c | 36 +++++++++++++++++++------------- >>>>>> ---- >>>>>> 1 file changed, 19 insertions(+), 17 deletions(-) >>>>>> >>>>>> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c >>>>>> index >>>>>> a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b1 >>>>>> 79a8 >>>>>> b50e >>>>>> 6965731542871 100644 >>>>>> --- a/net/mptcp/pm_netlink.c >>>>>> +++ b/net/mptcp/pm_netlink.c >>>> >>>> (...) >>>> >>>>>> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct >>>>>> sk_buff >>>>>> *msg, >>>>>> struct net *net = sock_net(msg->sk); >>>>>> struct mptcp_pm_addr_entry *entry; >>>>>> struct pm_nl_pernet *pernet; >>>>>> + unsigned long id_bitmap[4]; >>>>>> int id = cb->args[0]; >>>>>> void *hdr; >>>>>> int i; >>>>>> >>>>> pernet = pm_nl_get_pernet(net); >>>>>> + bitmap_copy(id_bitmap, pernet->id_bitmap, >>>>>> MPTCP_PM_MAX_ADDR_ID + 1); >>>>> >>>>> I think the id bitmap should only be copied when id is 0: >>>>> >>>>> if (!id) >>>>> bitmap_copy(id_bitmap, pernet->id_bitmap, >>>>> MPTCP_PM_MAX_ADDR_ID + >>>>> 1); >>>>> >>>>> Since this mptcp_pm_nl_dump_addr() may be called repeatedly >>>>> when >>>>> the >>>>> dump information is very long. We only copy it the first time >>>>> it is >>> >>> A correction is needed here. Regardless of whether the dump >>> information >>> is large or small, the dumpit function will be called repeatedly >>> when >>> dumpit returns non-zero. The loop stops when dumpit returns 0. >>> >>>>> called, and subsequent calls cannot copy it again. WDYT? >>>> >>>> Sorry, I'm not sure to understand. Here, I did a local copy of >>>> 'id_bitmap' just to keep a certain consistency if the bitmap is >>>> changed >>>> during the RCU read section below: not to have this bitmap being >>>> changed >>>> during the for loop here below. Before, we had this protection >>>> because >>>> pernet->lock was held. >>> >>> It is precisely because we need to maintain this consistency that >>> we >>> need to copy the bitmap only once. If we copy the bitmap when >>> dumpit is >>> called a second time, the bitmap obtained at this time is a new >>> bitmap, >>> which destroys this consistency, right? >> >> OK, but then I have a few questions: >> >> - Was the code before my patch here prevented such consistency >> issues? >> Or said differently: is there a regression introduced by this patch? >> >> - Is it really an issue that can lead to a crash or reading freed >> info? >> >> - Where would you store the copy of the bitmap? In cb-args? >> >> - If we store this copy somewhere, could you not have situations >> where >> the bitmap would no longer be in sync with the addresses that are >> stored >> in 'pernet->local_addr_list', and then displaying wrong/freed info? >> It >> looks like more would be needed to cope with that. And maybe that's >> not >> worth it? > > Your doubts are very reasonable. It seems that I have not considered it > carefully enough. So we do need to define a local bitmap and copy it > every time. It's fine to me and it's OK to implement BPF path manager > based on it. > Matthieu and Geliang - I think the only way to guarantee strict consistency would be to acquire the lock at the beginning of the dump and make a complete copy of both the bitmap and the address list at that moment. Even with all the work to achieve this "strictly consistent" snapshot, the userspace caller doesn't have any guarantee that the dumped data is accurate at the moment the netlink response completes. The good news is that I don't think this strict consistency is needed for the dump function. If the list is being modified concurrently, it's not a big problem to miss newly-added entries or show recently-deleted ones. Since bitmap_copy() is non-atomic, my suggestion is to not copy the bitmap. Instead use test_bit() (which is atomic) directly on pernet->id_bitmap without locking. Then, make sure that inconsistencies between the bitmap and the local_addr_list are handled cleanly. The two cases are: 1. Bitmap has a 0 even though there's a new entry in local_addr_list. In this case it gets skipped. No problem here. 2. Bitmap has a 1 even though there's no entry in local_addr_list (ignore it!), or there's an entry that is in the process of getting freed. For the latter, RCU will ensure there's no risk of an invalid dereference. If we want to detect a deletion-in-progress, we would have to add an atomic flag to 'struct mptcp_pm_addr_entry' that is set before the deletion code calls list_del_rcu(). Even if the address is in this state, I would argue that the just-barely-stale data is fine to show. Sound reasonable? Do you see a reason that bitmap/addr_list inconsistencies would cause issues? Thanks - Mat >> >>>> Are you suggesting here to keep a copy in the cb, not a local >>>> one, to >>>> do >>>> the copy only once? Are you sure it is worth it (and safe)? >>> >>> Because dumpit will be called repeatedly, we need to ensure that >>> the >>> same bitmap is accessed each time, so we cannot use a local one, >>> but >>> need to keep a copy in the cb just like what we do in >>> mptcp_userspace_pm_dump_addr. >>> >>> >>> In addition, when doing bitmap_copy, we need to hold pernet->lock: >>> >>> bitmap = (struct mptcp_id_bitmap *)cb->ctx; >>> >>> if (!id) { >>> spin_lock_bh(&pernet->lock); >>> bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + >>> 1); >>> spin_unlock_bh(&pernet->lock); >>> } >> >> Ah yes, maybe, I need to check: I thought the modification of the >> bitmap >> was done atomically, and it was fine to read without the lock. But >> maybe >> not because it is stored in multiple bytes. >> >>> Also, we can put this bitmap copy code into a separate patch, which >>> is >>> applied before this one. In this patch we only need to replace >>> spin_lock_bh with rcu_read_lock. >> >> OK so we are talking about separate fix that is not directly linked >> to >> my change, apart from the fact I modified code around, right? >> >> Cheers, >> Matt > > >
On 10/25/24 11:32, Matthieu Baerts (NGI0) wrote: > @@ -2060,17 +2062,17 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) > if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) > bkup = 1; > > - spin_lock_bh(&pernet->lock); > - entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) : > - __lookup_addr(pernet, &addr.addr); > + rcu_read_lock(); > + entry = lookup_by_id ? __lookup_addr_by_id_rcu(pernet, addr.addr.id) : > + __lookup_addr_rcu(pernet, &addr.addr); > if (!entry) { > - spin_unlock_bh(&pernet->lock); > + rcu_read_unlock(); > GENL_SET_ERR_MSG(info, "address not found"); > return -EINVAL; > } > if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && > (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { > - spin_unlock_bh(&pernet->lock); > + rcu_read_unlock(); > GENL_SET_ERR_MSG(info, "invalid addr flags"); > return -EINVAL; > } > @@ -2078,7 +2080,7 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) > changed = (addr.flags ^ entry->flags) & mask; > entry->flags = (entry->flags & ~mask) | (addr.flags & mask); > addr = *entry; > - spin_unlock_bh(&pernet->lock); > + rcu_read_unlock(); > > mptcp_nl_set_flags(net, &addr.addr, bkup, changed); > return 0; > I think we must retain the lock in this function, otherwise we could end-up with an unexpected flag combination. i.e. set_flag(MPTCP_PM_ADDR_FLAG_FULLMESH) and set_flag(MPTCP_PM_ADDR_FLAG_SIGNAL) run concurrently on CPU1 and CPU2. entry->flags could end-up having a single bit set instead of both, if both CPUs read the entry before the other would store the new value. I don't think _ONCE() annotations will be enough to avoid such thing without a lock. Cheers, Paolo
Hi Mat, On 01/11/2024 00:24, Mat Martineau wrote: > On Tue, 29 Oct 2024, Geliang Tang wrote: > >> Hi Matt, >> >> On Mon, 2024-10-28 at 12:48 +0100, Matthieu Baerts wrote: >>> Hi Geliang, >>> >>> On 28/10/2024 03:08, Geliang Tang wrote: >>>> Hi Matt, >>>> >>>> On Fri, 2024-10-25 at 17:26 +0200, Matthieu Baerts wrote: >>>>> Hi Geliang, >>>>> >>>>> Thank you for the review! >>>>> >>>>> On 25/10/2024 16:25, Geliang Tang wrote: >>>>>> On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0) >>>>>> wrote: >>>>>>> In a few places -- to get an endpoint, dump all of them, and >>>>>>> change >>>>>>> their flags -- the list is iterated while holding the pernet- >>>>>>>> lock, >>>>>>> but >>>>>>> only to read the content of the list. In these cases, we can >>>>>>> replace >>>>>>> the >>>>>>> spin locks, by RCU read ones, and use the _rcu variants to >>>>>>> iterate >>>>>>> over >>>>>>> the entries list in a lockless way. >>>>>>> >>>>>>> To make it clear, the lookup helpers using the _rcu variant >>>>>>> are >>>>>>> renamed >>>>>>> with a _rcu suffix. The previous __lookup_addr() helper can >>>>>>> then >>>>>>> be >>>>>>> removed, but __lookup_addr_by_id() is still needed. >>>>>>> >>>>>>> While at it, the IDs bitmap is copied before iterating the >>>>>>> list >>>>>>> to >>>>>>> dump >>>>>>> the different addresses, to avoid any consistencies. >>>>>>> >>>>>>> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> >>>>>>> --- >>>>>>> Notes: >>>>>>> - This is not a fix, a small improvement for -next. >>>>>>> --- >>>>>>> net/mptcp/pm_netlink.c | 36 +++++++++++++++++++------------- >>>>>>> ---- >>>>>>> 1 file changed, 19 insertions(+), 17 deletions(-) >>>>>>> >>>>>>> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c >>>>>>> index >>>>>>> a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b1 >>>>>>> 79a8 >>>>>>> b50e >>>>>>> 6965731542871 100644 >>>>>>> --- a/net/mptcp/pm_netlink.c >>>>>>> +++ b/net/mptcp/pm_netlink.c >>>>> >>>>> (...) >>>>> >>>>>>> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct >>>>>>> sk_buff >>>>>>> *msg, >>>>>>> struct net *net = sock_net(msg->sk); >>>>>>> struct mptcp_pm_addr_entry *entry; >>>>>>> struct pm_nl_pernet *pernet; >>>>>>> + unsigned long id_bitmap[4]; >>>>>>> int id = cb->args[0]; >>>>>>> void *hdr; >>>>>>> int i; >>>>>>> >>>>>> pernet = pm_nl_get_pernet(net); >>>>>>> + bitmap_copy(id_bitmap, pernet->id_bitmap, >>>>>>> MPTCP_PM_MAX_ADDR_ID + 1); >>>>>> >>>>>> I think the id bitmap should only be copied when id is 0: >>>>>> >>>>>> if (!id) >>>>>> bitmap_copy(id_bitmap, pernet->id_bitmap, >>>>>> MPTCP_PM_MAX_ADDR_ID + >>>>>> 1); >>>>>> >>>>>> Since this mptcp_pm_nl_dump_addr() may be called repeatedly >>>>>> when >>>>>> the >>>>>> dump information is very long. We only copy it the first time >>>>>> it is >>>> >>>> A correction is needed here. Regardless of whether the dump >>>> information >>>> is large or small, the dumpit function will be called repeatedly >>>> when >>>> dumpit returns non-zero. The loop stops when dumpit returns 0. >>>> >>>>>> called, and subsequent calls cannot copy it again. WDYT? >>>>> >>>>> Sorry, I'm not sure to understand. Here, I did a local copy of >>>>> 'id_bitmap' just to keep a certain consistency if the bitmap is >>>>> changed >>>>> during the RCU read section below: not to have this bitmap being >>>>> changed >>>>> during the for loop here below. Before, we had this protection >>>>> because >>>>> pernet->lock was held. >>>> >>>> It is precisely because we need to maintain this consistency that >>>> we >>>> need to copy the bitmap only once. If we copy the bitmap when >>>> dumpit is >>>> called a second time, the bitmap obtained at this time is a new >>>> bitmap, >>>> which destroys this consistency, right? >>> >>> OK, but then I have a few questions: >>> >>> - Was the code before my patch here prevented such consistency >>> issues? >>> Or said differently: is there a regression introduced by this patch? >>> >>> - Is it really an issue that can lead to a crash or reading freed >>> info? >>> >>> - Where would you store the copy of the bitmap? In cb-args? >>> >>> - If we store this copy somewhere, could you not have situations >>> where >>> the bitmap would no longer be in sync with the addresses that are >>> stored >>> in 'pernet->local_addr_list', and then displaying wrong/freed info? >>> It >>> looks like more would be needed to cope with that. And maybe that's >>> not >>> worth it? >> >> Your doubts are very reasonable. It seems that I have not considered it >> carefully enough. So we do need to define a local bitmap and copy it >> every time. It's fine to me and it's OK to implement BPF path manager >> based on it. >> > > Matthieu and Geliang - > > I think the only way to guarantee strict consistency would be to acquire > the lock at the beginning of the dump and make a complete copy of both > the bitmap and the address list at that moment. Even with all the work > to achieve this "strictly consistent" snapshot, the userspace caller > doesn't have any guarantee that the dumped data is accurate at the > moment the netlink response completes. > > > The good news is that I don't think this strict consistency is needed > for the dump function. If the list is being modified concurrently, it's > not a big problem to miss newly-added entries or show recently-deleted > ones. > > > Since bitmap_copy() is non-atomic, my suggestion is to not copy the > bitmap. Instead use test_bit() (which is atomic) directly on pernet- >>id_bitmap without locking. > > Then, make sure that inconsistencies between the bitmap and the > local_addr_list are handled cleanly. The two cases are: > > 1. Bitmap has a 0 even though there's a new entry in local_addr_list. In > this case it gets skipped. No problem here. > > 2. Bitmap has a 1 even though there's no entry in local_addr_list > (ignore it!), or there's an entry that is in the process of getting > freed. For the latter, RCU will ensure there's no risk of an invalid > dereference. If we want to detect a deletion-in-progress, we would have > to add an atomic flag to 'struct mptcp_pm_addr_entry' that is set before > the deletion code calls list_del_rcu(). Even if the address is in this > state, I would argue that the just-barely-stale data is fine to show. Thank you for your review! > Sound reasonable? Do you see a reason that bitmap/addr_list > inconsistencies would cause issues? Yes, that's sounds reasonable, as long as we don't try to access freed resources, I don't think we need to worry about inconsistencies if there are ongoing changes while the userspace is requesting the list. When I did the copy of the bitmap, it was mainly "as a matter of principle", but if it is not needed, that's even simpler. Cheers, Matt
Hi Paolo, Thank you for the review! On 05/11/2024 19:21, Paolo Abeni wrote: > On 10/25/24 11:32, Matthieu Baerts (NGI0) wrote: >> @@ -2060,17 +2062,17 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) >> if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) >> bkup = 1; >> >> - spin_lock_bh(&pernet->lock); >> - entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) : >> - __lookup_addr(pernet, &addr.addr); >> + rcu_read_lock(); >> + entry = lookup_by_id ? __lookup_addr_by_id_rcu(pernet, addr.addr.id) : >> + __lookup_addr_rcu(pernet, &addr.addr); >> if (!entry) { >> - spin_unlock_bh(&pernet->lock); >> + rcu_read_unlock(); >> GENL_SET_ERR_MSG(info, "address not found"); >> return -EINVAL; >> } >> if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && >> (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { >> - spin_unlock_bh(&pernet->lock); >> + rcu_read_unlock(); >> GENL_SET_ERR_MSG(info, "invalid addr flags"); >> return -EINVAL; >> } >> @@ -2078,7 +2080,7 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) >> changed = (addr.flags ^ entry->flags) & mask; >> entry->flags = (entry->flags & ~mask) | (addr.flags & mask); >> addr = *entry; >> - spin_unlock_bh(&pernet->lock); >> + rcu_read_unlock(); >> >> mptcp_nl_set_flags(net, &addr.addr, bkup, changed); >> return 0; >> > > I think we must retain the lock in this function, otherwise we could > end-up with an unexpected flag combination. > > i.e. set_flag(MPTCP_PM_ADDR_FLAG_FULLMESH) and > set_flag(MPTCP_PM_ADDR_FLAG_SIGNAL) run concurrently on CPU1 and CPU2. > entry->flags could end-up having a single bit set instead of both, if > both CPUs read the entry before the other would store the new value. Good point, I didn't think about this case where two commands would be done in parallel from the userspace. But can we get this case? By default, are the commands not serialised? (linked to parallel_ops member in struct genl_family?) In this case, is it OK to switch to rcu_read_lock? Or is it maybe not worth it? > I don't think _ONCE() annotations will be enough to avoid such thing > without a lock. Cheers, Matt
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8b50e6965731542871 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -520,12 +520,12 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) } static struct mptcp_pm_addr_entry * -__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) +__lookup_addr_by_id_rcu(struct pm_nl_pernet *pernet, unsigned int id) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry(entry, &pernet->local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (entry->addr.id == id) return entry; } return NULL; @@ -1836,8 +1836,8 @@ int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) goto fail; } - spin_lock_bh(&pernet->lock); - entry = __lookup_addr_by_id(pernet, addr.addr.id); + rcu_read_lock(); + entry = __lookup_addr_by_id_rcu(pernet, addr.addr.id); if (!entry) { GENL_SET_ERR_MSG(info, "address not found"); ret = -EINVAL; @@ -1850,11 +1850,11 @@ int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); - spin_unlock_bh(&pernet->lock); + rcu_read_unlock(); return ret; unlock_fail: - spin_unlock_bh(&pernet->lock); + rcu_read_unlock(); fail: nlmsg_free(msg); @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct net *net = sock_net(msg->sk); struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; + unsigned long id_bitmap[4]; int id = cb->args[0]; void *hdr; int i; pernet = pm_nl_get_pernet(net); + bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); - spin_lock_bh(&pernet->lock); + rcu_read_lock(); for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { - if (test_bit(i, pernet->id_bitmap)) { - entry = __lookup_addr_by_id(pernet, i); + if (test_bit(i, id_bitmap)) { + entry = __lookup_addr_by_id_rcu(pernet, i); if (!entry) break; @@ -1903,7 +1905,7 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, genlmsg_end(msg, hdr); } } - spin_unlock_bh(&pernet->lock); + rcu_read_unlock(); cb->args[0] = id; return msg->len; @@ -2060,17 +2062,17 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; - spin_lock_bh(&pernet->lock); - entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) : - __lookup_addr(pernet, &addr.addr); + rcu_read_lock(); + entry = lookup_by_id ? __lookup_addr_by_id_rcu(pernet, addr.addr.id) : + __lookup_addr_rcu(pernet, &addr.addr); if (!entry) { - spin_unlock_bh(&pernet->lock); + rcu_read_unlock(); GENL_SET_ERR_MSG(info, "address not found"); return -EINVAL; } if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { - spin_unlock_bh(&pernet->lock); + rcu_read_unlock(); GENL_SET_ERR_MSG(info, "invalid addr flags"); return -EINVAL; } @@ -2078,7 +2080,7 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) changed = (addr.flags ^ entry->flags) & mask; entry->flags = (entry->flags & ~mask) | (addr.flags & mask); addr = *entry; - spin_unlock_bh(&pernet->lock); + rcu_read_unlock(); mptcp_nl_set_flags(net, &addr.addr, bkup, changed); return 0;
In a few places -- to get an endpoint, dump all of them, and change their flags -- the list is iterated while holding the pernet->lock, but only to read the content of the list. In these cases, we can replace the spin locks, by RCU read ones, and use the _rcu variants to iterate over the entries list in a lockless way. To make it clear, the lookup helpers using the _rcu variant are renamed with a _rcu suffix. The previous __lookup_addr() helper can then be removed, but __lookup_addr_by_id() is still needed. While at it, the IDs bitmap is copied before iterating the list to dump the different addresses, to avoid any consistencies. Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> --- Notes: - This is not a fix, a small improvement for -next. --- net/mptcp/pm_netlink.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-)