diff mbox series

[mptcp-net,v2,2/3] mptcp: pm: lockless list traversal

Message ID 20241025-mptcp-pm-lookup_addr_rcu-v2-2-1478f6c4b205@kernel.org (mailing list archive)
State Superseded, archived
Headers show
Series mptcp: pm: use _rcu variant under rcu_read_lock | expand

Checks

Context Check Description
matttbe/build success Build and static analysis OK
matttbe/checkpatch success total: 0 errors, 0 warnings, 0 checks, 97 lines checked
matttbe/shellcheck success MPTCP selftests files have not been modified
matttbe/KVM_Validation__normal success Success! ✅
matttbe/KVM_Validation__debug success Success! ✅
matttbe/KVM_Validation__btf-normal__only_bpftest_all_ success Success! ✅
matttbe/KVM_Validation__btf-debug__only_bpftest_all_ success Success! ✅

Commit Message

Matthieu Baerts Oct. 25, 2024, 9:32 a.m. UTC
In a few places -- to get an endpoint, dump all of them, and change
their flags -- the list is iterated while holding the pernet->lock, but
only to read the content of the list. In these cases, we can replace the
spin locks, by RCU read ones, and use the _rcu variants to iterate over
the entries list in a lockless way.

To make it clear, the lookup helpers using the _rcu variant are renamed
with a _rcu suffix. The previous __lookup_addr() helper can then be
removed, but __lookup_addr_by_id() is still needed.

While at it, the IDs bitmap is copied before iterating the list to dump
the different addresses, to avoid any consistencies.

Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
---
Notes:
 - This is not a fix, a small improvement for -next.
---
 net/mptcp/pm_netlink.c | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

Comments

Geliang Tang Oct. 25, 2024, 2:25 p.m. UTC | #1
Hi Matt,

Thanks for this patch.

On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0) wrote:
> In a few places -- to get an endpoint, dump all of them, and change
> their flags -- the list is iterated while holding the pernet->lock,
> but
> only to read the content of the list. In these cases, we can replace
> the
> spin locks, by RCU read ones, and use the _rcu variants to iterate
> over
> the entries list in a lockless way.
> 
> To make it clear, the lookup helpers using the _rcu variant are
> renamed
> with a _rcu suffix. The previous __lookup_addr() helper can then be
> removed, but __lookup_addr_by_id() is still needed.
> 
> While at it, the IDs bitmap is copied before iterating the list to
> dump
> the different addresses, to avoid any consistencies.
> 
> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
> ---
> Notes:
>  - This is not a fix, a small improvement for -next.
> ---
>  net/mptcp/pm_netlink.c | 36 +++++++++++++++++++-----------------
>  1 file changed, 19 insertions(+), 17 deletions(-)
> 
> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
> index
> a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8b50e
> 6965731542871 100644
> --- a/net/mptcp/pm_netlink.c
> +++ b/net/mptcp/pm_netlink.c
> @@ -520,12 +520,12 @@ __lookup_addr_by_id(struct pm_nl_pernet
> *pernet, unsigned int id)
>  }
>  
>  static struct mptcp_pm_addr_entry *
> -__lookup_addr(struct pm_nl_pernet *pernet, const struct
> mptcp_addr_info *info)
> +__lookup_addr_by_id_rcu(struct pm_nl_pernet *pernet, unsigned int
> id)
>  {
>  	struct mptcp_pm_addr_entry *entry;
>  
> -	list_for_each_entry(entry, &pernet->local_addr_list, list) {
> -		if (mptcp_addresses_equal(&entry->addr, info, entry-
> >addr.port))
> +	list_for_each_entry_rcu(entry, &pernet->local_addr_list,
> list) {
> +		if (entry->addr.id == id)
>  			return entry;
>  	}
>  	return NULL;
> @@ -1836,8 +1836,8 @@ int mptcp_pm_nl_get_addr(struct sk_buff *skb,
> struct genl_info *info)
>  		goto fail;
>  	}
>  
> -	spin_lock_bh(&pernet->lock);
> -	entry = __lookup_addr_by_id(pernet, addr.addr.id);
> +	rcu_read_lock();
> +	entry = __lookup_addr_by_id_rcu(pernet, addr.addr.id);
>  	if (!entry) {
>  		GENL_SET_ERR_MSG(info, "address not found");
>  		ret = -EINVAL;
> @@ -1850,11 +1850,11 @@ int mptcp_pm_nl_get_addr(struct sk_buff *skb,
> struct genl_info *info)
>  
>  	genlmsg_end(msg, reply);
>  	ret = genlmsg_reply(msg, info);
> -	spin_unlock_bh(&pernet->lock);
> +	rcu_read_unlock();
>  	return ret;
>  
>  unlock_fail:
> -	spin_unlock_bh(&pernet->lock);
> +	rcu_read_unlock();
>  
>  fail:
>  	nlmsg_free(msg);
> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff
> *msg,
>  	struct net *net = sock_net(msg->sk);
>  	struct mptcp_pm_addr_entry *entry;
>  	struct pm_nl_pernet *pernet;
> +	unsigned long id_bitmap[4];
>  	int id = cb->args[0];
>  	void *hdr;
>  	int i;
>  
>  	pernet = pm_nl_get_pernet(net);
> +	bitmap_copy(id_bitmap, pernet->id_bitmap,
> MPTCP_PM_MAX_ADDR_ID + 1);

I think the id bitmap should only be copied when id is 0:

if (!id)
  bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);

Since this mptcp_pm_nl_dump_addr() may be called repeatedly when the
dump information is very long. We only copy it the first time it is
called, and subsequent calls cannot copy it again. WDYT?

Thanks,
-Geliang

>  
> -	spin_lock_bh(&pernet->lock);
> +	rcu_read_lock();
>  	for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
> -		if (test_bit(i, pernet->id_bitmap)) {
> -			entry = __lookup_addr_by_id(pernet, i);
> +		if (test_bit(i, id_bitmap)) {
> +			entry = __lookup_addr_by_id_rcu(pernet, i);
>  			if (!entry)
>  				break;
>  
> @@ -1903,7 +1905,7 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
>  			genlmsg_end(msg, hdr);
>  		}
>  	}
> -	spin_unlock_bh(&pernet->lock);
> +	rcu_read_unlock();
>  
>  	cb->args[0] = id;
>  	return msg->len;
> @@ -2060,17 +2062,17 @@ int mptcp_pm_nl_set_flags(struct sk_buff
> *skb, struct genl_info *info)
>  	if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
>  		bkup = 1;
>  
> -	spin_lock_bh(&pernet->lock);
> -	entry = lookup_by_id ? __lookup_addr_by_id(pernet,
> addr.addr.id) :
> -			       __lookup_addr(pernet, &addr.addr);
> +	rcu_read_lock();
> +	entry = lookup_by_id ? __lookup_addr_by_id_rcu(pernet,
> addr.addr.id) :
> +			       __lookup_addr_rcu(pernet,
> &addr.addr);
>  	if (!entry) {
> -		spin_unlock_bh(&pernet->lock);
> +		rcu_read_unlock();
>  		GENL_SET_ERR_MSG(info, "address not found");
>  		return -EINVAL;
>  	}
>  	if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
>  	    (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
> -		spin_unlock_bh(&pernet->lock);
> +		rcu_read_unlock();
>  		GENL_SET_ERR_MSG(info, "invalid addr flags");
>  		return -EINVAL;
>  	}
> @@ -2078,7 +2080,7 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb,
> struct genl_info *info)
>  	changed = (addr.flags ^ entry->flags) & mask;
>  	entry->flags = (entry->flags & ~mask) | (addr.flags & mask);
>  	addr = *entry;
> -	spin_unlock_bh(&pernet->lock);
> +	rcu_read_unlock();
>  
>  	mptcp_nl_set_flags(net, &addr.addr, bkup, changed);
>  	return 0;
>
Matthieu Baerts Oct. 25, 2024, 3:17 p.m. UTC | #2
On 25/10/2024 11:32, Matthieu Baerts (NGI0) wrote:
> In a few places -- to get an endpoint, dump all of them, and change
> their flags -- the list is iterated while holding the pernet->lock, but
> only to read the content of the list. In these cases, we can replace the
> spin locks, by RCU read ones, and use the _rcu variants to iterate over
> the entries list in a lockless way.
> 
> To make it clear, the lookup helpers using the _rcu variant are renamed
> with a _rcu suffix. The previous __lookup_addr() helper can then be
> removed, but __lookup_addr_by_id() is still needed.
> 
> While at it, the IDs bitmap is copied before iterating the list to dump
> the different addresses, to avoid any consistencies.
> 
> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
> ---
> Notes:
>  - This is not a fix, a small improvement for -next.
> ---
>  net/mptcp/pm_netlink.c | 36 +++++++++++++++++++-----------------
>  1 file changed, 19 insertions(+), 17 deletions(-)
> 
> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
> index a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8b50e6965731542871 100644
> --- a/net/mptcp/pm_netlink.c
> +++ b/net/mptcp/pm_netlink.c
> @@ -520,12 +520,12 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
>  }
>  
>  static struct mptcp_pm_addr_entry *
> -__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
> +__lookup_addr_by_id_rcu(struct pm_nl_pernet *pernet, unsigned int id)
>  {
>  	struct mptcp_pm_addr_entry *entry;
>  
> -	list_for_each_entry(entry, &pernet->local_addr_list, list) {
> -		if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port))
> +	list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {

I didn't see the following in the RCU doc -- but it is quite big, so I
probably missed it -- but I suppose we don't need to keep the two
helpers here with and without the _rcu variant (__lookup_addr_by_id())
if here we add an extra lockdep condition:

  list_for_each_entry_rcu(entry, &pernet->local_addr_list, list,
                          lockdep_is_held(&pernet->lock)) {

WDYT?

> +		if (entry->addr.id == id)
>  			return entry;
>  	}
>  	return NULL;

(...)

> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
>  	struct net *net = sock_net(msg->sk);
>  	struct mptcp_pm_addr_entry *entry;
>  	struct pm_nl_pernet *pernet;
> +	unsigned long id_bitmap[4];

Oops, I left my temp declaration, it should be the following line, which
will result in the same thing anyway on x86_64, no need to re-run the
tests → I can fix that when applying the patches, or in a future version:

  DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);

Cheers,
Matt
Matthieu Baerts Oct. 25, 2024, 3:26 p.m. UTC | #3
Hi Geliang,

Thank you for the review!

On 25/10/2024 16:25, Geliang Tang wrote:
> On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0) wrote:
>> In a few places -- to get an endpoint, dump all of them, and change
>> their flags -- the list is iterated while holding the pernet->lock,
>> but
>> only to read the content of the list. In these cases, we can replace
>> the
>> spin locks, by RCU read ones, and use the _rcu variants to iterate
>> over
>> the entries list in a lockless way.
>>
>> To make it clear, the lookup helpers using the _rcu variant are
>> renamed
>> with a _rcu suffix. The previous __lookup_addr() helper can then be
>> removed, but __lookup_addr_by_id() is still needed.
>>
>> While at it, the IDs bitmap is copied before iterating the list to
>> dump
>> the different addresses, to avoid any consistencies.
>>
>> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
>> ---
>> Notes:
>>  - This is not a fix, a small improvement for -next.
>> ---
>>  net/mptcp/pm_netlink.c | 36 +++++++++++++++++++-----------------
>>  1 file changed, 19 insertions(+), 17 deletions(-)
>>
>> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
>> index
>> a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8b50e
>> 6965731542871 100644
>> --- a/net/mptcp/pm_netlink.c
>> +++ b/net/mptcp/pm_netlink.c

(...)

>> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff
>> *msg,
>>  	struct net *net = sock_net(msg->sk);
>>  	struct mptcp_pm_addr_entry *entry;
>>  	struct pm_nl_pernet *pernet;
>> +	unsigned long id_bitmap[4];
>>  	int id = cb->args[0];
>>  	void *hdr;
>>  	int i;
>>  
>>  	pernet = pm_nl_get_pernet(net);
>> +	bitmap_copy(id_bitmap, pernet->id_bitmap,
>> MPTCP_PM_MAX_ADDR_ID + 1);
> 
> I think the id bitmap should only be copied when id is 0:
> 
> if (!id)
>   bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
> 
> Since this mptcp_pm_nl_dump_addr() may be called repeatedly when the
> dump information is very long. We only copy it the first time it is
> called, and subsequent calls cannot copy it again. WDYT?

Sorry, I'm not sure to understand. Here, I did a local copy of
'id_bitmap' just to keep a certain consistency if the bitmap is changed
during the RCU read section below: not to have this bitmap being changed
during the for loop here below. Before, we had this protection because
pernet->lock was held.

Are you suggesting here to keep a copy in the cb, not a local one, to do
the copy only once? Are you sure it is worth it (and safe)?

>>  
>> -	spin_lock_bh(&pernet->lock);
>> +	rcu_read_lock();
>>  	for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
>> -		if (test_bit(i, pernet->id_bitmap)) {
>> -			entry = __lookup_addr_by_id(pernet, i);
>> +		if (test_bit(i, id_bitmap)) {
>> +			entry = __lookup_addr_by_id_rcu(pernet, i);
>>  			if (!entry)
>>  				break;
>>  
Cheers,
Matt
Geliang Tang Oct. 28, 2024, 2:08 a.m. UTC | #4
Hi Matt,

On Fri, 2024-10-25 at 17:26 +0200, Matthieu Baerts wrote:
> Hi Geliang,
> 
> Thank you for the review!
> 
> On 25/10/2024 16:25, Geliang Tang wrote:
> > On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0) wrote:
> > > In a few places -- to get an endpoint, dump all of them, and
> > > change
> > > their flags -- the list is iterated while holding the pernet-
> > > >lock,
> > > but
> > > only to read the content of the list. In these cases, we can
> > > replace
> > > the
> > > spin locks, by RCU read ones, and use the _rcu variants to
> > > iterate
> > > over
> > > the entries list in a lockless way.
> > > 
> > > To make it clear, the lookup helpers using the _rcu variant are
> > > renamed
> > > with a _rcu suffix. The previous __lookup_addr() helper can then
> > > be
> > > removed, but __lookup_addr_by_id() is still needed.
> > > 
> > > While at it, the IDs bitmap is copied before iterating the list
> > > to
> > > dump
> > > the different addresses, to avoid any consistencies.
> > > 
> > > Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
> > > ---
> > > Notes:
> > >  - This is not a fix, a small improvement for -next.
> > > ---
> > >  net/mptcp/pm_netlink.c | 36 +++++++++++++++++++-----------------
> > >  1 file changed, 19 insertions(+), 17 deletions(-)
> > > 
> > > diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
> > > index
> > > a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8
> > > b50e
> > > 6965731542871 100644
> > > --- a/net/mptcp/pm_netlink.c
> > > +++ b/net/mptcp/pm_netlink.c
> 
> (...)
> 
> > > @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff
> > > *msg,
> > >  	struct net *net = sock_net(msg->sk);
> > >  	struct mptcp_pm_addr_entry *entry;
> > >  	struct pm_nl_pernet *pernet;
> > > +	unsigned long id_bitmap[4];
> > >  	int id = cb->args[0];
> > >  	void *hdr;
> > >  	int i;
> > >  
> > >  	pernet = pm_nl_get_pernet(net);
> > > +	bitmap_copy(id_bitmap, pernet->id_bitmap,
> > > MPTCP_PM_MAX_ADDR_ID + 1);
> > 
> > I think the id bitmap should only be copied when id is 0:
> > 
> > if (!id)
> >   bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID +
> > 1);
> > 
> > Since this mptcp_pm_nl_dump_addr() may be called repeatedly when
> > the
> > dump information is very long. We only copy it the first time it is

A correction is needed here. Regardless of whether the dump information
is large or small, the dumpit function will be called repeatedly when
dumpit returns non-zero. The loop stops when dumpit returns 0.

> > called, and subsequent calls cannot copy it again. WDYT?
> 
> Sorry, I'm not sure to understand. Here, I did a local copy of
> 'id_bitmap' just to keep a certain consistency if the bitmap is
> changed
> during the RCU read section below: not to have this bitmap being
> changed
> during the for loop here below. Before, we had this protection
> because
> pernet->lock was held.

It is precisely because we need to maintain this consistency that we
need to copy the bitmap only once. If we copy the bitmap when dumpit is
called a second time, the bitmap obtained at this time is a new bitmap,
which destroys this consistency, right?

> 
> Are you suggesting here to keep a copy in the cb, not a local one, to
> do
> the copy only once? Are you sure it is worth it (and safe)?

Because dumpit will be called repeatedly, we need to ensure that the
same bitmap is accessed each time, so we cannot use a local one, but
need to keep a copy in the cb just like what we do in
mptcp_userspace_pm_dump_addr.


In addition, when doing bitmap_copy, we need to hold pernet->lock:

bitmap = (struct mptcp_id_bitmap *)cb->ctx;

if (!id) {
  spin_lock_bh(&pernet->lock);
  bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
  spin_unlock_bh(&pernet->lock);
}

Also, we can put this bitmap copy code into a separate patch, which is
applied before this one. In this patch we only need to replace
spin_lock_bh with rcu_read_lock.

WDYT?

Thanks,
-Geliang

> 
> > >  
> > > -	spin_lock_bh(&pernet->lock);
> > > +	rcu_read_lock();
> > >  	for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
> > > -		if (test_bit(i, pernet->id_bitmap)) {
> > > -			entry = __lookup_addr_by_id(pernet, i);
> > > +		if (test_bit(i, id_bitmap)) {
> > > +			entry = __lookup_addr_by_id_rcu(pernet,
> > > i);
> > >  			if (!entry)
> > >  				break;
> > >  
> Cheers,
> Matt
Matthieu Baerts Oct. 28, 2024, 11:48 a.m. UTC | #5
Hi Geliang,

On 28/10/2024 03:08, Geliang Tang wrote:
> Hi Matt,
> 
> On Fri, 2024-10-25 at 17:26 +0200, Matthieu Baerts wrote:
>> Hi Geliang,
>>
>> Thank you for the review!
>>
>> On 25/10/2024 16:25, Geliang Tang wrote:
>>> On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0) wrote:
>>>> In a few places -- to get an endpoint, dump all of them, and
>>>> change
>>>> their flags -- the list is iterated while holding the pernet-
>>>>> lock,
>>>> but
>>>> only to read the content of the list. In these cases, we can
>>>> replace
>>>> the
>>>> spin locks, by RCU read ones, and use the _rcu variants to
>>>> iterate
>>>> over
>>>> the entries list in a lockless way.
>>>>
>>>> To make it clear, the lookup helpers using the _rcu variant are
>>>> renamed
>>>> with a _rcu suffix. The previous __lookup_addr() helper can then
>>>> be
>>>> removed, but __lookup_addr_by_id() is still needed.
>>>>
>>>> While at it, the IDs bitmap is copied before iterating the list
>>>> to
>>>> dump
>>>> the different addresses, to avoid any consistencies.
>>>>
>>>> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
>>>> ---
>>>> Notes:
>>>>  - This is not a fix, a small improvement for -next.
>>>> ---
>>>>  net/mptcp/pm_netlink.c | 36 +++++++++++++++++++-----------------
>>>>  1 file changed, 19 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
>>>> index
>>>> a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8
>>>> b50e
>>>> 6965731542871 100644
>>>> --- a/net/mptcp/pm_netlink.c
>>>> +++ b/net/mptcp/pm_netlink.c
>>
>> (...)
>>
>>>> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff
>>>> *msg,
>>>>  	struct net *net = sock_net(msg->sk);
>>>>  	struct mptcp_pm_addr_entry *entry;
>>>>  	struct pm_nl_pernet *pernet;
>>>> +	unsigned long id_bitmap[4];
>>>>  	int id = cb->args[0];
>>>>  	void *hdr;
>>>>  	int i;
>>>>  
>>>>  	pernet = pm_nl_get_pernet(net);
>>>> +	bitmap_copy(id_bitmap, pernet->id_bitmap,
>>>> MPTCP_PM_MAX_ADDR_ID + 1);
>>>
>>> I think the id bitmap should only be copied when id is 0:
>>>
>>> if (!id)
>>>   bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID +
>>> 1);
>>>
>>> Since this mptcp_pm_nl_dump_addr() may be called repeatedly when
>>> the
>>> dump information is very long. We only copy it the first time it is
> 
> A correction is needed here. Regardless of whether the dump information
> is large or small, the dumpit function will be called repeatedly when
> dumpit returns non-zero. The loop stops when dumpit returns 0.
> 
>>> called, and subsequent calls cannot copy it again. WDYT?
>>
>> Sorry, I'm not sure to understand. Here, I did a local copy of
>> 'id_bitmap' just to keep a certain consistency if the bitmap is
>> changed
>> during the RCU read section below: not to have this bitmap being
>> changed
>> during the for loop here below. Before, we had this protection
>> because
>> pernet->lock was held.
> 
> It is precisely because we need to maintain this consistency that we
> need to copy the bitmap only once. If we copy the bitmap when dumpit is
> called a second time, the bitmap obtained at this time is a new bitmap,
> which destroys this consistency, right?

OK, but then I have a few questions:

- Was the code before my patch here prevented such consistency issues?
Or said differently: is there a regression introduced by this patch?

- Is it really an issue that can lead to a crash or reading freed info?

- Where would you store the copy of the bitmap? In cb-args?

- If we store this copy somewhere, could you not have situations where
the bitmap would no longer be in sync with the addresses that are stored
in 'pernet->local_addr_list', and then displaying wrong/freed info? It
looks like more would be needed to cope with that. And maybe that's not
worth it?

>> Are you suggesting here to keep a copy in the cb, not a local one, to
>> do
>> the copy only once? Are you sure it is worth it (and safe)?
> 
> Because dumpit will be called repeatedly, we need to ensure that the
> same bitmap is accessed each time, so we cannot use a local one, but
> need to keep a copy in the cb just like what we do in
> mptcp_userspace_pm_dump_addr.
> 
> 
> In addition, when doing bitmap_copy, we need to hold pernet->lock:
> 
> bitmap = (struct mptcp_id_bitmap *)cb->ctx;
> 
> if (!id) {
>   spin_lock_bh(&pernet->lock);
>   bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
>   spin_unlock_bh(&pernet->lock);
> }

Ah yes, maybe, I need to check: I thought the modification of the bitmap
was done atomically, and it was fine to read without the lock. But maybe
not because it is stored in multiple bytes.

> Also, we can put this bitmap copy code into a separate patch, which is
> applied before this one. In this patch we only need to replace
> spin_lock_bh with rcu_read_lock.

OK so we are talking about separate fix that is not directly linked to
my change, apart from the fact I modified code around, right?

Cheers,
Matt
Geliang Tang Oct. 29, 2024, 8:43 a.m. UTC | #6
Hi Matt,

On Mon, 2024-10-28 at 12:48 +0100, Matthieu Baerts wrote:
> Hi Geliang,
> 
> On 28/10/2024 03:08, Geliang Tang wrote:
> > Hi Matt,
> > 
> > On Fri, 2024-10-25 at 17:26 +0200, Matthieu Baerts wrote:
> > > Hi Geliang,
> > > 
> > > Thank you for the review!
> > > 
> > > On 25/10/2024 16:25, Geliang Tang wrote:
> > > > On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0)
> > > > wrote:
> > > > > In a few places -- to get an endpoint, dump all of them, and
> > > > > change
> > > > > their flags -- the list is iterated while holding the pernet-
> > > > > > lock,
> > > > > but
> > > > > only to read the content of the list. In these cases, we can
> > > > > replace
> > > > > the
> > > > > spin locks, by RCU read ones, and use the _rcu variants to
> > > > > iterate
> > > > > over
> > > > > the entries list in a lockless way.
> > > > > 
> > > > > To make it clear, the lookup helpers using the _rcu variant
> > > > > are
> > > > > renamed
> > > > > with a _rcu suffix. The previous __lookup_addr() helper can
> > > > > then
> > > > > be
> > > > > removed, but __lookup_addr_by_id() is still needed.
> > > > > 
> > > > > While at it, the IDs bitmap is copied before iterating the
> > > > > list
> > > > > to
> > > > > dump
> > > > > the different addresses, to avoid any consistencies.
> > > > > 
> > > > > Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
> > > > > ---
> > > > > Notes:
> > > > >  - This is not a fix, a small improvement for -next.
> > > > > ---
> > > > >  net/mptcp/pm_netlink.c | 36 +++++++++++++++++++-------------
> > > > > ----
> > > > >  1 file changed, 19 insertions(+), 17 deletions(-)
> > > > > 
> > > > > diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
> > > > > index
> > > > > a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b1
> > > > > 79a8
> > > > > b50e
> > > > > 6965731542871 100644
> > > > > --- a/net/mptcp/pm_netlink.c
> > > > > +++ b/net/mptcp/pm_netlink.c
> > > 
> > > (...)
> > > 
> > > > > @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct
> > > > > sk_buff
> > > > > *msg,
> > > > >   struct net *net = sock_net(msg->sk);
> > > > >   struct mptcp_pm_addr_entry *entry;
> > > > >   struct pm_nl_pernet *pernet;
> > > > > + unsigned long id_bitmap[4];
> > > > >   int id = cb->args[0];
> > > > >   void *hdr;
> > > > >   int i;
> > > > >  
> > > > >   pernet = pm_nl_get_pernet(net);
> > > > > + bitmap_copy(id_bitmap, pernet->id_bitmap,
> > > > > MPTCP_PM_MAX_ADDR_ID + 1);
> > > > 
> > > > I think the id bitmap should only be copied when id is 0:
> > > > 
> > > > if (!id)
> > > >   bitmap_copy(id_bitmap, pernet->id_bitmap,
> > > > MPTCP_PM_MAX_ADDR_ID +
> > > > 1);
> > > > 
> > > > Since this mptcp_pm_nl_dump_addr() may be called repeatedly
> > > > when
> > > > the
> > > > dump information is very long. We only copy it the first time
> > > > it is
> > 
> > A correction is needed here. Regardless of whether the dump
> > information
> > is large or small, the dumpit function will be called repeatedly
> > when
> > dumpit returns non-zero. The loop stops when dumpit returns 0.
> > 
> > > > called, and subsequent calls cannot copy it again. WDYT?
> > > 
> > > Sorry, I'm not sure to understand. Here, I did a local copy of
> > > 'id_bitmap' just to keep a certain consistency if the bitmap is
> > > changed
> > > during the RCU read section below: not to have this bitmap being
> > > changed
> > > during the for loop here below. Before, we had this protection
> > > because
> > > pernet->lock was held.
> > 
> > It is precisely because we need to maintain this consistency that
> > we
> > need to copy the bitmap only once. If we copy the bitmap when
> > dumpit is
> > called a second time, the bitmap obtained at this time is a new
> > bitmap,
> > which destroys this consistency, right?
> 
> OK, but then I have a few questions:
> 
> - Was the code before my patch here prevented such consistency
> issues?
> Or said differently: is there a regression introduced by this patch?
> 
> - Is it really an issue that can lead to a crash or reading freed
> info?
> 
> - Where would you store the copy of the bitmap? In cb-args?
> 
> - If we store this copy somewhere, could you not have situations
> where
> the bitmap would no longer be in sync with the addresses that are
> stored
> in 'pernet->local_addr_list', and then displaying wrong/freed info?
> It
> looks like more would be needed to cope with that. And maybe that's
> not
> worth it?

Your doubts are very reasonable. It seems that I have not considered it
carefully enough. So we do need to define a local bitmap and copy it
every time. It's fine to me and it's OK to implement BPF path manager
based on it.

Thanks,
-Geliang

> 
> > > Are you suggesting here to keep a copy in the cb, not a local
> > > one, to
> > > do
> > > the copy only once? Are you sure it is worth it (and safe)?
> > 
> > Because dumpit will be called repeatedly, we need to ensure that
> > the
> > same bitmap is accessed each time, so we cannot use a local one,
> > but
> > need to keep a copy in the cb just like what we do in
> > mptcp_userspace_pm_dump_addr.
> > 
> > 
> > In addition, when doing bitmap_copy, we need to hold pernet->lock:
> > 
> > bitmap = (struct mptcp_id_bitmap *)cb->ctx;
> > 
> > if (!id) {
> >   spin_lock_bh(&pernet->lock);
> >   bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID +
> > 1);
> >   spin_unlock_bh(&pernet->lock);
> > }
> 
> Ah yes, maybe, I need to check: I thought the modification of the
> bitmap
> was done atomically, and it was fine to read without the lock. But
> maybe
> not because it is stored in multiple bytes.
> 
> > Also, we can put this bitmap copy code into a separate patch, which
> > is
> > applied before this one. In this patch we only need to replace
> > spin_lock_bh with rcu_read_lock.
> 
> OK so we are talking about separate fix that is not directly linked
> to
> my change, apart from the fact I modified code around, right?
> 
> Cheers,
> Matt
Mat Martineau Oct. 31, 2024, 10:14 p.m. UTC | #7
On Fri, 25 Oct 2024, Matthieu Baerts wrote:

> On 25/10/2024 11:32, Matthieu Baerts (NGI0) wrote:
>> In a few places -- to get an endpoint, dump all of them, and change
>> their flags -- the list is iterated while holding the pernet->lock, but
>> only to read the content of the list. In these cases, we can replace the
>> spin locks, by RCU read ones, and use the _rcu variants to iterate over
>> the entries list in a lockless way.
>>
>> To make it clear, the lookup helpers using the _rcu variant are renamed
>> with a _rcu suffix. The previous __lookup_addr() helper can then be
>> removed, but __lookup_addr_by_id() is still needed.
>>
>> While at it, the IDs bitmap is copied before iterating the list to dump
>> the different addresses, to avoid any consistencies.
>>
>> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
>> ---
>> Notes:
>>  - This is not a fix, a small improvement for -next.
>> ---
>>  net/mptcp/pm_netlink.c | 36 +++++++++++++++++++-----------------
>>  1 file changed, 19 insertions(+), 17 deletions(-)
>>
>> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
>> index a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8b50e6965731542871 100644
>> --- a/net/mptcp/pm_netlink.c
>> +++ b/net/mptcp/pm_netlink.c
>> @@ -520,12 +520,12 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
>>  }
>>
>>  static struct mptcp_pm_addr_entry *
>> -__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
>> +__lookup_addr_by_id_rcu(struct pm_nl_pernet *pernet, unsigned int id)
>>  {
>>  	struct mptcp_pm_addr_entry *entry;
>>
>> -	list_for_each_entry(entry, &pernet->local_addr_list, list) {
>> -		if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port))
>> +	list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
>
> I didn't see the following in the RCU doc -- but it is quite big, so I
> probably missed it -- but I suppose we don't need to keep the two
> helpers here with and without the _rcu variant (__lookup_addr_by_id())
> if here we add an extra lockdep condition:
>
>  list_for_each_entry_rcu(entry, &pernet->local_addr_list, list,
>                          lockdep_is_held(&pernet->lock)) {
>
> WDYT?

I like this idea, would clean things up.

>
>> +		if (entry->addr.id == id)
>>  			return entry;
>>  	}
>>  	return NULL;
>
> (...)
>
>> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
>>  	struct net *net = sock_net(msg->sk);
>>  	struct mptcp_pm_addr_entry *entry;
>>  	struct pm_nl_pernet *pernet;
>> +	unsigned long id_bitmap[4];
>
> Oops, I left my temp declaration, it should be the following line, which
> will result in the same thing anyway on x86_64, no need to re-run the
> tests → I can fix that when applying the patches, or in a future version:
>
>  DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
>

Thanks for catching that. I think it would be best to have a v3 with the 
lockdep_is_held() approach you mentioned above.


- Mat
Mat Martineau Oct. 31, 2024, 11:24 p.m. UTC | #8
On Tue, 29 Oct 2024, Geliang Tang wrote:

> Hi Matt,
>
> On Mon, 2024-10-28 at 12:48 +0100, Matthieu Baerts wrote:
>> Hi Geliang,
>>
>> On 28/10/2024 03:08, Geliang Tang wrote:
>>> Hi Matt,
>>>
>>> On Fri, 2024-10-25 at 17:26 +0200, Matthieu Baerts wrote:
>>>> Hi Geliang,
>>>>
>>>> Thank you for the review!
>>>>
>>>> On 25/10/2024 16:25, Geliang Tang wrote:
>>>>> On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0)
>>>>> wrote:
>>>>>> In a few places -- to get an endpoint, dump all of them, and
>>>>>> change
>>>>>> their flags -- the list is iterated while holding the pernet-
>>>>>>> lock,
>>>>>> but
>>>>>> only to read the content of the list. In these cases, we can
>>>>>> replace
>>>>>> the
>>>>>> spin locks, by RCU read ones, and use the _rcu variants to
>>>>>> iterate
>>>>>> over
>>>>>> the entries list in a lockless way.
>>>>>>
>>>>>> To make it clear, the lookup helpers using the _rcu variant
>>>>>> are
>>>>>> renamed
>>>>>> with a _rcu suffix. The previous __lookup_addr() helper can
>>>>>> then
>>>>>> be
>>>>>> removed, but __lookup_addr_by_id() is still needed.
>>>>>>
>>>>>> While at it, the IDs bitmap is copied before iterating the
>>>>>> list
>>>>>> to
>>>>>> dump
>>>>>> the different addresses, to avoid any consistencies.
>>>>>>
>>>>>> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
>>>>>> ---
>>>>>> Notes:
>>>>>>  - This is not a fix, a small improvement for -next.
>>>>>> ---
>>>>>>  net/mptcp/pm_netlink.c | 36 +++++++++++++++++++-------------
>>>>>> ----
>>>>>>  1 file changed, 19 insertions(+), 17 deletions(-)
>>>>>>
>>>>>> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
>>>>>> index
>>>>>> a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b1
>>>>>> 79a8
>>>>>> b50e
>>>>>> 6965731542871 100644
>>>>>> --- a/net/mptcp/pm_netlink.c
>>>>>> +++ b/net/mptcp/pm_netlink.c
>>>>
>>>> (...)
>>>>
>>>>>> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct
>>>>>> sk_buff
>>>>>> *msg,
>>>>>>   struct net *net = sock_net(msg->sk);
>>>>>>   struct mptcp_pm_addr_entry *entry;
>>>>>>   struct pm_nl_pernet *pernet;
>>>>>> + unsigned long id_bitmap[4];
>>>>>>   int id = cb->args[0];
>>>>>>   void *hdr;
>>>>>>   int i;
>>>>>>
>>>>>   pernet = pm_nl_get_pernet(net);
>>>>>> + bitmap_copy(id_bitmap, pernet->id_bitmap,
>>>>>> MPTCP_PM_MAX_ADDR_ID + 1);
>>>>>
>>>>> I think the id bitmap should only be copied when id is 0:
>>>>>
>>>>> if (!id)
>>>>>   bitmap_copy(id_bitmap, pernet->id_bitmap,
>>>>> MPTCP_PM_MAX_ADDR_ID +
>>>>> 1);
>>>>>
>>>>> Since this mptcp_pm_nl_dump_addr() may be called repeatedly
>>>>> when
>>>>> the
>>>>> dump information is very long. We only copy it the first time
>>>>> it is
>>>
>>> A correction is needed here. Regardless of whether the dump
>>> information
>>> is large or small, the dumpit function will be called repeatedly
>>> when
>>> dumpit returns non-zero. The loop stops when dumpit returns 0.
>>>
>>>>> called, and subsequent calls cannot copy it again. WDYT?
>>>>
>>>> Sorry, I'm not sure to understand. Here, I did a local copy of
>>>> 'id_bitmap' just to keep a certain consistency if the bitmap is
>>>> changed
>>>> during the RCU read section below: not to have this bitmap being
>>>> changed
>>>> during the for loop here below. Before, we had this protection
>>>> because
>>>> pernet->lock was held.
>>>
>>> It is precisely because we need to maintain this consistency that
>>> we
>>> need to copy the bitmap only once. If we copy the bitmap when
>>> dumpit is
>>> called a second time, the bitmap obtained at this time is a new
>>> bitmap,
>>> which destroys this consistency, right?
>>
>> OK, but then I have a few questions:
>>
>> - Was the code before my patch here prevented such consistency
>> issues?
>> Or said differently: is there a regression introduced by this patch?
>>
>> - Is it really an issue that can lead to a crash or reading freed
>> info?
>>
>> - Where would you store the copy of the bitmap? In cb-args?
>>
>> - If we store this copy somewhere, could you not have situations
>> where
>> the bitmap would no longer be in sync with the addresses that are
>> stored
>> in 'pernet->local_addr_list', and then displaying wrong/freed info?
>> It
>> looks like more would be needed to cope with that. And maybe that's
>> not
>> worth it?
>
> Your doubts are very reasonable. It seems that I have not considered it
> carefully enough. So we do need to define a local bitmap and copy it
> every time. It's fine to me and it's OK to implement BPF path manager
> based on it.
>

Matthieu and Geliang -

I think the only way to guarantee strict consistency would be to acquire 
the lock at the beginning of the dump and make a complete copy of both the 
bitmap and the address list at that moment. Even with all the work to 
achieve this "strictly consistent" snapshot, the userspace caller doesn't 
have any guarantee that the dumped data is accurate at the moment the 
netlink response completes.


The good news is that I don't think this strict consistency is needed for 
the dump function. If the list is being modified concurrently, it's not a 
big problem to miss newly-added entries or show recently-deleted ones.


Since bitmap_copy() is non-atomic, my suggestion is to not copy the 
bitmap. Instead use test_bit() (which is atomic) directly on 
pernet->id_bitmap without locking.

Then, make sure that inconsistencies between the bitmap and the 
local_addr_list are handled cleanly. The two cases are:

1. Bitmap has a 0 even though there's a new entry in local_addr_list. In 
this case it gets skipped. No problem here.

2. Bitmap has a 1 even though there's no entry in local_addr_list (ignore 
it!), or there's an entry that is in the process of getting freed. For the 
latter, RCU will ensure there's no risk of an invalid dereference. If we 
want to detect a deletion-in-progress, we would have to add an atomic flag 
to 'struct mptcp_pm_addr_entry' that is set before the deletion code calls 
list_del_rcu(). Even if the address is in this state, I would argue that 
the just-barely-stale data is fine to show.


Sound reasonable? Do you see a reason that bitmap/addr_list 
inconsistencies would cause issues?

Thanks

- Mat


>>
>>>> Are you suggesting here to keep a copy in the cb, not a local
>>>> one, to
>>>> do
>>>> the copy only once? Are you sure it is worth it (and safe)?
>>>
>>> Because dumpit will be called repeatedly, we need to ensure that
>>> the
>>> same bitmap is accessed each time, so we cannot use a local one,
>>> but
>>> need to keep a copy in the cb just like what we do in
>>> mptcp_userspace_pm_dump_addr.
>>>
>>>
>>> In addition, when doing bitmap_copy, we need to hold pernet->lock:
>>>
>>> bitmap = (struct mptcp_id_bitmap *)cb->ctx;
>>>
>>> if (!id) {
>>>   spin_lock_bh(&pernet->lock);
>>>   bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID +
>>> 1);
>>>   spin_unlock_bh(&pernet->lock);
>>> }
>>
>> Ah yes, maybe, I need to check: I thought the modification of the
>> bitmap
>> was done atomically, and it was fine to read without the lock. But
>> maybe
>> not because it is stored in multiple bytes.



>>
>>> Also, we can put this bitmap copy code into a separate patch, which
>>> is
>>> applied before this one. In this patch we only need to replace
>>> spin_lock_bh with rcu_read_lock.
>>
>> OK so we are talking about separate fix that is not directly linked
>> to
>> my change, apart from the fact I modified code around, right?
>>
>> Cheers,
>> Matt
>
>
>
Paolo Abeni Nov. 5, 2024, 6:21 p.m. UTC | #9
On 10/25/24 11:32, Matthieu Baerts (NGI0) wrote:
> @@ -2060,17 +2062,17 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info)
>  	if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
>  		bkup = 1;
>  
> -	spin_lock_bh(&pernet->lock);
> -	entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) :
> -			       __lookup_addr(pernet, &addr.addr);
> +	rcu_read_lock();
> +	entry = lookup_by_id ? __lookup_addr_by_id_rcu(pernet, addr.addr.id) :
> +			       __lookup_addr_rcu(pernet, &addr.addr);
>  	if (!entry) {
> -		spin_unlock_bh(&pernet->lock);
> +		rcu_read_unlock();
>  		GENL_SET_ERR_MSG(info, "address not found");
>  		return -EINVAL;
>  	}
>  	if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
>  	    (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
> -		spin_unlock_bh(&pernet->lock);
> +		rcu_read_unlock();
>  		GENL_SET_ERR_MSG(info, "invalid addr flags");
>  		return -EINVAL;
>  	}
> @@ -2078,7 +2080,7 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info)
>  	changed = (addr.flags ^ entry->flags) & mask;
>  	entry->flags = (entry->flags & ~mask) | (addr.flags & mask);
>  	addr = *entry;
> -	spin_unlock_bh(&pernet->lock);
> +	rcu_read_unlock();
>  
>  	mptcp_nl_set_flags(net, &addr.addr, bkup, changed);
>  	return 0;
> 

I think we must retain the lock in this function, otherwise we could
end-up with an unexpected flag combination.

i.e. set_flag(MPTCP_PM_ADDR_FLAG_FULLMESH) and
set_flag(MPTCP_PM_ADDR_FLAG_SIGNAL) run concurrently on CPU1 and CPU2.
entry->flags could end-up having a single bit set instead of both, if
both CPUs read the entry before the other would store the new value.

I don't think _ONCE() annotations will be enough to avoid such thing
without a lock.

Cheers,

Paolo
Matthieu Baerts Nov. 6, 2024, 4:03 p.m. UTC | #10
Hi Mat,

On 01/11/2024 00:24, Mat Martineau wrote:
> On Tue, 29 Oct 2024, Geliang Tang wrote:
> 
>> Hi Matt,
>>
>> On Mon, 2024-10-28 at 12:48 +0100, Matthieu Baerts wrote:
>>> Hi Geliang,
>>>
>>> On 28/10/2024 03:08, Geliang Tang wrote:
>>>> Hi Matt,
>>>>
>>>> On Fri, 2024-10-25 at 17:26 +0200, Matthieu Baerts wrote:
>>>>> Hi Geliang,
>>>>>
>>>>> Thank you for the review!
>>>>>
>>>>> On 25/10/2024 16:25, Geliang Tang wrote:
>>>>>> On Fri, 2024-10-25 at 11:32 +0200, Matthieu Baerts (NGI0)
>>>>>> wrote:
>>>>>>> In a few places -- to get an endpoint, dump all of them, and
>>>>>>> change
>>>>>>> their flags -- the list is iterated while holding the pernet-
>>>>>>>> lock,
>>>>>>> but
>>>>>>> only to read the content of the list. In these cases, we can
>>>>>>> replace
>>>>>>> the
>>>>>>> spin locks, by RCU read ones, and use the _rcu variants to
>>>>>>> iterate
>>>>>>> over
>>>>>>> the entries list in a lockless way.
>>>>>>>
>>>>>>> To make it clear, the lookup helpers using the _rcu variant
>>>>>>> are
>>>>>>> renamed
>>>>>>> with a _rcu suffix. The previous __lookup_addr() helper can
>>>>>>> then
>>>>>>> be
>>>>>>> removed, but __lookup_addr_by_id() is still needed.
>>>>>>>
>>>>>>> While at it, the IDs bitmap is copied before iterating the
>>>>>>> list
>>>>>>> to
>>>>>>> dump
>>>>>>> the different addresses, to avoid any consistencies.
>>>>>>>
>>>>>>> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
>>>>>>> ---
>>>>>>> Notes:
>>>>>>>  - This is not a fix, a small improvement for -next.
>>>>>>> ---
>>>>>>>  net/mptcp/pm_netlink.c | 36 +++++++++++++++++++-------------
>>>>>>> ----
>>>>>>>  1 file changed, 19 insertions(+), 17 deletions(-)
>>>>>>>
>>>>>>> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
>>>>>>> index
>>>>>>> a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b1
>>>>>>> 79a8
>>>>>>> b50e
>>>>>>> 6965731542871 100644
>>>>>>> --- a/net/mptcp/pm_netlink.c
>>>>>>> +++ b/net/mptcp/pm_netlink.c
>>>>>
>>>>> (...)
>>>>>
>>>>>>> @@ -1872,16 +1872,18 @@ int mptcp_pm_nl_dump_addr(struct
>>>>>>> sk_buff
>>>>>>> *msg,
>>>>>>>   struct net *net = sock_net(msg->sk);
>>>>>>>   struct mptcp_pm_addr_entry *entry;
>>>>>>>   struct pm_nl_pernet *pernet;
>>>>>>> + unsigned long id_bitmap[4];
>>>>>>>   int id = cb->args[0];
>>>>>>>   void *hdr;
>>>>>>>   int i;
>>>>>>>
>>>>>>   pernet = pm_nl_get_pernet(net);
>>>>>>> + bitmap_copy(id_bitmap, pernet->id_bitmap,
>>>>>>> MPTCP_PM_MAX_ADDR_ID + 1);
>>>>>>
>>>>>> I think the id bitmap should only be copied when id is 0:
>>>>>>
>>>>>> if (!id)
>>>>>>   bitmap_copy(id_bitmap, pernet->id_bitmap,
>>>>>> MPTCP_PM_MAX_ADDR_ID +
>>>>>> 1);
>>>>>>
>>>>>> Since this mptcp_pm_nl_dump_addr() may be called repeatedly
>>>>>> when
>>>>>> the
>>>>>> dump information is very long. We only copy it the first time
>>>>>> it is
>>>>
>>>> A correction is needed here. Regardless of whether the dump
>>>> information
>>>> is large or small, the dumpit function will be called repeatedly
>>>> when
>>>> dumpit returns non-zero. The loop stops when dumpit returns 0.
>>>>
>>>>>> called, and subsequent calls cannot copy it again. WDYT?
>>>>>
>>>>> Sorry, I'm not sure to understand. Here, I did a local copy of
>>>>> 'id_bitmap' just to keep a certain consistency if the bitmap is
>>>>> changed
>>>>> during the RCU read section below: not to have this bitmap being
>>>>> changed
>>>>> during the for loop here below. Before, we had this protection
>>>>> because
>>>>> pernet->lock was held.
>>>>
>>>> It is precisely because we need to maintain this consistency that
>>>> we
>>>> need to copy the bitmap only once. If we copy the bitmap when
>>>> dumpit is
>>>> called a second time, the bitmap obtained at this time is a new
>>>> bitmap,
>>>> which destroys this consistency, right?
>>>
>>> OK, but then I have a few questions:
>>>
>>> - Was the code before my patch here prevented such consistency
>>> issues?
>>> Or said differently: is there a regression introduced by this patch?
>>>
>>> - Is it really an issue that can lead to a crash or reading freed
>>> info?
>>>
>>> - Where would you store the copy of the bitmap? In cb-args?
>>>
>>> - If we store this copy somewhere, could you not have situations
>>> where
>>> the bitmap would no longer be in sync with the addresses that are
>>> stored
>>> in 'pernet->local_addr_list', and then displaying wrong/freed info?
>>> It
>>> looks like more would be needed to cope with that. And maybe that's
>>> not
>>> worth it?
>>
>> Your doubts are very reasonable. It seems that I have not considered it
>> carefully enough. So we do need to define a local bitmap and copy it
>> every time. It's fine to me and it's OK to implement BPF path manager
>> based on it.
>>
> 
> Matthieu and Geliang -
> 
> I think the only way to guarantee strict consistency would be to acquire
> the lock at the beginning of the dump and make a complete copy of both
> the bitmap and the address list at that moment. Even with all the work
> to achieve this "strictly consistent" snapshot, the userspace caller
> doesn't have any guarantee that the dumped data is accurate at the
> moment the netlink response completes.
> 
> 
> The good news is that I don't think this strict consistency is needed
> for the dump function. If the list is being modified concurrently, it's
> not a big problem to miss newly-added entries or show recently-deleted
> ones.
> 
> 
> Since bitmap_copy() is non-atomic, my suggestion is to not copy the
> bitmap. Instead use test_bit() (which is atomic) directly on pernet-
>>id_bitmap without locking.
> 
> Then, make sure that inconsistencies between the bitmap and the
> local_addr_list are handled cleanly. The two cases are:
> 
> 1. Bitmap has a 0 even though there's a new entry in local_addr_list. In
> this case it gets skipped. No problem here.
> 
> 2. Bitmap has a 1 even though there's no entry in local_addr_list
> (ignore it!), or there's an entry that is in the process of getting
> freed. For the latter, RCU will ensure there's no risk of an invalid
> dereference. If we want to detect a deletion-in-progress, we would have
> to add an atomic flag to 'struct mptcp_pm_addr_entry' that is set before
> the deletion code calls list_del_rcu(). Even if the address is in this
> state, I would argue that the just-barely-stale data is fine to show.

Thank you for your review!

> Sound reasonable? Do you see a reason that bitmap/addr_list
> inconsistencies would cause issues?
Yes, that's sounds reasonable, as long as we don't try to access freed
resources, I don't think we need to worry about inconsistencies if there
are ongoing changes while the userspace is requesting the list.

When I did the copy of the bitmap, it was mainly "as a matter of
principle", but if it is not needed, that's even simpler.

Cheers,
Matt
Matthieu Baerts Nov. 6, 2024, 4:23 p.m. UTC | #11
Hi Paolo,

Thank you for the review!

On 05/11/2024 19:21, Paolo Abeni wrote:
> On 10/25/24 11:32, Matthieu Baerts (NGI0) wrote:
>> @@ -2060,17 +2062,17 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info)
>>  	if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
>>  		bkup = 1;
>>  
>> -	spin_lock_bh(&pernet->lock);
>> -	entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) :
>> -			       __lookup_addr(pernet, &addr.addr);
>> +	rcu_read_lock();
>> +	entry = lookup_by_id ? __lookup_addr_by_id_rcu(pernet, addr.addr.id) :
>> +			       __lookup_addr_rcu(pernet, &addr.addr);
>>  	if (!entry) {
>> -		spin_unlock_bh(&pernet->lock);
>> +		rcu_read_unlock();
>>  		GENL_SET_ERR_MSG(info, "address not found");
>>  		return -EINVAL;
>>  	}
>>  	if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
>>  	    (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
>> -		spin_unlock_bh(&pernet->lock);
>> +		rcu_read_unlock();
>>  		GENL_SET_ERR_MSG(info, "invalid addr flags");
>>  		return -EINVAL;
>>  	}
>> @@ -2078,7 +2080,7 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info)
>>  	changed = (addr.flags ^ entry->flags) & mask;
>>  	entry->flags = (entry->flags & ~mask) | (addr.flags & mask);
>>  	addr = *entry;
>> -	spin_unlock_bh(&pernet->lock);
>> +	rcu_read_unlock();
>>  
>>  	mptcp_nl_set_flags(net, &addr.addr, bkup, changed);
>>  	return 0;
>>
> 
> I think we must retain the lock in this function, otherwise we could
> end-up with an unexpected flag combination.
> 
> i.e. set_flag(MPTCP_PM_ADDR_FLAG_FULLMESH) and
> set_flag(MPTCP_PM_ADDR_FLAG_SIGNAL) run concurrently on CPU1 and CPU2.
> entry->flags could end-up having a single bit set instead of both, if
> both CPUs read the entry before the other would store the new value.

Good point, I didn't think about this case where two commands would be
done in parallel from the userspace. But can we get this case? By
default, are the commands not serialised? (linked to parallel_ops member
in struct genl_family?)

In this case, is it OK to switch to rcu_read_lock? Or is it maybe not
worth it?

> I don't think _ONCE() annotations will be enough to avoid such thing
> without a lock.

Cheers,
Matt
diff mbox series

Patch

diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index a93b9b7776b48781a883673fe5fd521a978487ff..f38e1ccd34e95cd88b179a8b50e6965731542871 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -520,12 +520,12 @@  __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
 }
 
 static struct mptcp_pm_addr_entry *
-__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
+__lookup_addr_by_id_rcu(struct pm_nl_pernet *pernet, unsigned int id)
 {
 	struct mptcp_pm_addr_entry *entry;
 
-	list_for_each_entry(entry, &pernet->local_addr_list, list) {
-		if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port))
+	list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+		if (entry->addr.id == id)
 			return entry;
 	}
 	return NULL;
@@ -1836,8 +1836,8 @@  int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info)
 		goto fail;
 	}
 
-	spin_lock_bh(&pernet->lock);
-	entry = __lookup_addr_by_id(pernet, addr.addr.id);
+	rcu_read_lock();
+	entry = __lookup_addr_by_id_rcu(pernet, addr.addr.id);
 	if (!entry) {
 		GENL_SET_ERR_MSG(info, "address not found");
 		ret = -EINVAL;
@@ -1850,11 +1850,11 @@  int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info)
 
 	genlmsg_end(msg, reply);
 	ret = genlmsg_reply(msg, info);
-	spin_unlock_bh(&pernet->lock);
+	rcu_read_unlock();
 	return ret;
 
 unlock_fail:
-	spin_unlock_bh(&pernet->lock);
+	rcu_read_unlock();
 
 fail:
 	nlmsg_free(msg);
@@ -1872,16 +1872,18 @@  int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
 	struct net *net = sock_net(msg->sk);
 	struct mptcp_pm_addr_entry *entry;
 	struct pm_nl_pernet *pernet;
+	unsigned long id_bitmap[4];
 	int id = cb->args[0];
 	void *hdr;
 	int i;
 
 	pernet = pm_nl_get_pernet(net);
+	bitmap_copy(id_bitmap, pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
 
-	spin_lock_bh(&pernet->lock);
+	rcu_read_lock();
 	for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
-		if (test_bit(i, pernet->id_bitmap)) {
-			entry = __lookup_addr_by_id(pernet, i);
+		if (test_bit(i, id_bitmap)) {
+			entry = __lookup_addr_by_id_rcu(pernet, i);
 			if (!entry)
 				break;
 
@@ -1903,7 +1905,7 @@  int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
 			genlmsg_end(msg, hdr);
 		}
 	}
-	spin_unlock_bh(&pernet->lock);
+	rcu_read_unlock();
 
 	cb->args[0] = id;
 	return msg->len;
@@ -2060,17 +2062,17 @@  int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info)
 	if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
 		bkup = 1;
 
-	spin_lock_bh(&pernet->lock);
-	entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) :
-			       __lookup_addr(pernet, &addr.addr);
+	rcu_read_lock();
+	entry = lookup_by_id ? __lookup_addr_by_id_rcu(pernet, addr.addr.id) :
+			       __lookup_addr_rcu(pernet, &addr.addr);
 	if (!entry) {
-		spin_unlock_bh(&pernet->lock);
+		rcu_read_unlock();
 		GENL_SET_ERR_MSG(info, "address not found");
 		return -EINVAL;
 	}
 	if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
 	    (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
-		spin_unlock_bh(&pernet->lock);
+		rcu_read_unlock();
 		GENL_SET_ERR_MSG(info, "invalid addr flags");
 		return -EINVAL;
 	}
@@ -2078,7 +2080,7 @@  int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info)
 	changed = (addr.flags ^ entry->flags) & mask;
 	entry->flags = (entry->flags & ~mask) | (addr.flags & mask);
 	addr = *entry;
-	spin_unlock_bh(&pernet->lock);
+	rcu_read_unlock();
 
 	mptcp_nl_set_flags(net, &addr.addr, bkup, changed);
 	return 0;