diff mbox series

[PATCHv4,net,1/3] bonding: move IPsec deletion to bond_ipsec_free_sa

Message ID 20250304131120.31135-2-liuhangbin@gmail.com (mailing list archive)
State New
Headers show
Series bond: fix xfrm offload issues | expand

Commit Message

Hangbin Liu March 4, 2025, 1:11 p.m. UTC
The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers
a warning:

  BUG: sleeping function called from invalid context at...

Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa,
which is not held by spin_lock_bh().

Additionally, delete the IPsec list in bond_ipsec_del_sa_all() when the
XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered
again in bond_ipsec_free_sa().

For bond_ipsec_free_sa(), there are now three conditions:

  1. if (!slave): When no active device exists.
  2. if (!xs->xso.real_dev): When xdo_dev_state_add() fails.
  3. if (xs->xso.real_dev != real_dev): When an xs has already been freed
     by bond_ipsec_del_sa_all() due to migration, and the active slave has
     changed to a new device. At the same time, the xs is marked as DEAD
     due to the XFRM entry is removed, triggering xfrm_state_gc_task() and
     bond_ipsec_free_sa().

In all three cases, xdo_dev_state_free() should not be called, only xs
should be removed from bond->ipsec list.

At the same time, protect bond_ipsec_del_sa_all and bond_ipsec_add_sa_all
with x->lock for each xs being processed. This prevents XFRM from
concurrently initiating add/delete operations on the managed states.

Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
Suggested-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 drivers/net/bonding/bond_main.c | 53 +++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 16 deletions(-)

Comments

Nikolay Aleksandrov March 5, 2025, 8:38 a.m. UTC | #1
On 3/4/25 15:11, Hangbin Liu wrote:
> The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers
> a warning:
> 
>   BUG: sleeping function called from invalid context at...
> 
> Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa,
> which is not held by spin_lock_bh().
> 
> Additionally, delete the IPsec list in bond_ipsec_del_sa_all() when the
> XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered
> again in bond_ipsec_free_sa().
> 
> For bond_ipsec_free_sa(), there are now three conditions:
> 
>   1. if (!slave): When no active device exists.
>   2. if (!xs->xso.real_dev): When xdo_dev_state_add() fails.
>   3. if (xs->xso.real_dev != real_dev): When an xs has already been freed
>      by bond_ipsec_del_sa_all() due to migration, and the active slave has
>      changed to a new device. At the same time, the xs is marked as DEAD
>      due to the XFRM entry is removed, triggering xfrm_state_gc_task() and
>      bond_ipsec_free_sa().
> 
> In all three cases, xdo_dev_state_free() should not be called, only xs
> should be removed from bond->ipsec list.
> 
> At the same time, protect bond_ipsec_del_sa_all and bond_ipsec_add_sa_all
> with x->lock for each xs being processed. This prevents XFRM from
> concurrently initiating add/delete operations on the managed states.
> 
> Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
> Suggested-by: Cosmin Ratiu <cratiu@nvidia.com>
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---
>  drivers/net/bonding/bond_main.c | 53 +++++++++++++++++++++++----------
>  1 file changed, 37 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index e45bba240cbc..06b060d9b031 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -537,15 +537,22 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
>  	}
>  
>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> +		spin_lock_bh(&ipsec->xs->lock);
> +		/* Skip dead xfrm states, they'll be freed later. */
> +		if (ipsec->xs->km.state == XFRM_STATE_DEAD)
> +			goto next;
> +
>  		/* If new state is added before ipsec_lock acquired */
>  		if (ipsec->xs->xso.real_dev == real_dev)
> -			continue;
> +			goto next;
>  
>  		ipsec->xs->xso.real_dev = real_dev;
>  		if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
>  			slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
>  			ipsec->xs->xso.real_dev = NULL;
>  		}
> +next:
> +		spin_unlock_bh(&ipsec->xs->lock);
>  	}
>  out:
>  	mutex_unlock(&bond->ipsec_lock);
> @@ -560,7 +567,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
>  	struct net_device *bond_dev = xs->xso.dev;
>  	struct net_device *real_dev;
>  	netdevice_tracker tracker;
> -	struct bond_ipsec *ipsec;
>  	struct bonding *bond;
>  	struct slave *slave;
>  
> @@ -592,15 +598,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
>  	real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
>  out:
>  	netdev_put(real_dev, &tracker);
> -	mutex_lock(&bond->ipsec_lock);
> -	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> -		if (ipsec->xs == xs) {
> -			list_del(&ipsec->list);
> -			kfree(ipsec);
> -			break;
> -		}
> -	}
> -	mutex_unlock(&bond->ipsec_lock);
>  }
>  
>  static void bond_ipsec_del_sa_all(struct bonding *bond)
> @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
>  
>  	mutex_lock(&bond->ipsec_lock);
>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {

Second time - you should use list_for_each_entry_safe if you're walking and deleting
elements from the list.

> +		spin_lock_bh(&ipsec->xs->lock);
>  		if (!ipsec->xs->xso.real_dev)
> -			continue;
> +			goto next;
> +
> +		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> +			/* already dead no need to delete again */
> +			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> +				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);

Have you checked if .xdo_dev_state_free can sleep?
I see at least one that can: mlx5e_xfrm_free_state().

> +			list_del(&ipsec->list);
> +			kfree(ipsec);
> +			goto next;
> +		}
>  
>  		if (!real_dev->xfrmdev_ops ||
>  		    !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
> @@ -631,6 +638,8 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
>  			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
>  				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
>  		}
> +next:
> +		spin_unlock_bh(&ipsec->xs->lock);
>  	}
>  	mutex_unlock(&bond->ipsec_lock);
>  }
> @@ -640,6 +649,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
>  	struct net_device *bond_dev = xs->xso.dev;
>  	struct net_device *real_dev;
>  	netdevice_tracker tracker;
> +	struct bond_ipsec *ipsec;
>  	struct bonding *bond;
>  	struct slave *slave;
>  
> @@ -659,11 +669,22 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
>  	if (!xs->xso.real_dev)
>  		goto out;
>  
> -	WARN_ON(xs->xso.real_dev != real_dev);
> +	mutex_lock(&bond->ipsec_lock);
> +	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> +		if (ipsec->xs == xs) {
> +			/* do xdo_dev_state_free if real_dev matches,
> +			 * otherwise only remove the list
> +			 */
> +			if (real_dev && real_dev->xfrmdev_ops &&
> +			    real_dev->xfrmdev_ops->xdo_dev_state_free)
> +				real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
> +			list_del(&ipsec->list);
> +			kfree(ipsec);
> +			break;
> +		}
> +	}
> +	mutex_unlock(&bond->ipsec_lock);
>  
> -	if (real_dev && real_dev->xfrmdev_ops &&
> -	    real_dev->xfrmdev_ops->xdo_dev_state_free)
> -		real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
>  out:
>  	netdev_put(real_dev, &tracker);
>  }
Hangbin Liu March 5, 2025, 2:13 p.m. UTC | #2
On Wed, Mar 05, 2025 at 10:38:36AM +0200, Nikolay Aleksandrov wrote:
> > @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
> >  
> >  	mutex_lock(&bond->ipsec_lock);
> >  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> 
> Second time - you should use list_for_each_entry_safe if you're walking and deleting
> elements from the list.

Sorry, I missed this comment. I will update in next version.

> 
> > +		spin_lock_bh(&ipsec->xs->lock);
> >  		if (!ipsec->xs->xso.real_dev)
> > -			continue;
> > +			goto next;
> > +
> > +		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> > +			/* already dead no need to delete again */
> > +			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> > +				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
> 
> Have you checked if .xdo_dev_state_free can sleep?
> I see at least one that can: mlx5e_xfrm_free_state().

Hmm, This brings us back to the initial problem. We tried to avoid calling
a spin lock in a sleep context (bond_ipsec_del_sa), but now the new code
encounters this issue again.

With your reply, I also checked the xdo_dev_state_add() in
bond_ipsec_add_sa_all(), which may also sleep, e.g. mlx5e_xfrm_add_state(),

If we unlock the spin lock, then the race came back again.

Any idea about this?

thanks
Hangbin
Cosmin Ratiu March 5, 2025, 4:12 p.m. UTC | #3
On Wed, 2025-03-05 at 14:13 +0000, Hangbin Liu wrote:
> On Wed, Mar 05, 2025 at 10:38:36AM +0200, Nikolay Aleksandrov wrote:
> > > @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct
> > > bonding *bond)
> > >  
> > >  	mutex_lock(&bond->ipsec_lock);
> > >  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> > 
> > Second time - you should use list_for_each_entry_safe if you're
> > walking and deleting
> > elements from the list.
> 
> Sorry, I missed this comment. I will update in next version.
> 
> > 
> > > +		spin_lock_bh(&ipsec->xs->lock);
> > >  		if (!ipsec->xs->xso.real_dev)
> > > -			continue;
> > > +			goto next;
> > > +
> > > +		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> > > +			/* already dead no need to delete again
> > > */
> > > +			if (real_dev->xfrmdev_ops-
> > > >xdo_dev_state_free)
> > > +				real_dev->xfrmdev_ops-
> > > >xdo_dev_state_free(ipsec->xs);
> > 
> > Have you checked if .xdo_dev_state_free can sleep?
> > I see at least one that can: mlx5e_xfrm_free_state().
> 
> Hmm, This brings us back to the initial problem. We tried to avoid
> calling
> a spin lock in a sleep context (bond_ipsec_del_sa), but now the new
> code
> encounters this issue again.

The reason the mutex was added (instead of the spinlock used before)
was exactly because the add and free offload operations could sleep.

> With your reply, I also checked the xdo_dev_state_add() in
> bond_ipsec_add_sa_all(), which may also sleep, e.g.
> mlx5e_xfrm_add_state(),
> 
> If we unlock the spin lock, then the race came back again.
> 
> Any idea about this?

The race is between bond_ipsec_del_sa_all and bond_ipsec_del_sa (plus
bond_ipsec_free_sa). The issue is that when bond_ipsec_del_sa_all
releases x->lock, bond_ipsec_del_sa can immediately be called, followed
by bond_ipsec_free_sa.
Maybe dropping x->lock after setting real_dev to NULL? I checked,
real_dev is not used anywhere on the free calls, I think. I have
another series refactoring things around real_dev, I hope to be able to
send it soon.

Here's a sketch of this idea:

--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding
*bond)
 
        mutex_lock(&bond->ipsec_lock);
        list_for_each_entry(ipsec, &bond->ipsec_list, list) {
-               if (!ipsec->xs->xso.real_dev)
+               spin_lock(&ipsec->x->lock);
+               if (!ipsec->xs->xso.real_dev) {
+                       spin_unlock(&ipsec->x->lock);
                        continue;
+               }
 
                if (!real_dev->xfrmdev_ops ||
                    !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
@@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding
*bond)
                        slave_warn(bond_dev, real_dev,
                                   "%s: no slave
xdo_dev_state_delete\n",
                                   __func__);
-               } else {
-                       real_dev->xfrmdev_ops-
>xdo_dev_state_delete(real_dev, ipsec->xs);
-                       if (real_dev->xfrmdev_ops->xdo_dev_state_free)
-                               real_dev->xfrmdev_ops-
>xdo_dev_state_free(ipsec->xs);
-                       ipsec->xs->xso.real_dev = NULL;
+                       spin_unlock(&ipsec->x->lock);
+                       continue;
                }
+
+               real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev,
ipsec->xs);
+               ipsec->xs->xso.real_dev = NULL;
+               /* Unlock before freeing device state, it could sleep.
*/
+               spin_unlock(&ipsec->x->lock);
+               if (real_dev->xfrmdev_ops->xdo_dev_state_free)
+                       real_dev->xfrmdev_ops-
>xdo_dev_state_free(ipsec->xs);

Cosmin.
Hangbin Liu March 6, 2025, 9:37 a.m. UTC | #4
On Wed, Mar 05, 2025 at 04:12:18PM +0000, Cosmin Ratiu wrote:
> On Wed, 2025-03-05 at 14:13 +0000, Hangbin Liu wrote:
> > On Wed, Mar 05, 2025 at 10:38:36AM +0200, Nikolay Aleksandrov wrote:
> > > > @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct
> > > > bonding *bond)
> > > >  
> > > >  	mutex_lock(&bond->ipsec_lock);
> > > >  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> > > 
> > > Second time - you should use list_for_each_entry_safe if you're
> > > walking and deleting
> > > elements from the list.
> > 
> > Sorry, I missed this comment. I will update in next version.
> > 
> > > 
> > > > +		spin_lock_bh(&ipsec->xs->lock);
> > > >  		if (!ipsec->xs->xso.real_dev)
> > > > -			continue;
> > > > +			goto next;
> > > > +
> > > > +		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> > > > +			/* already dead no need to delete again
> > > > */
> > > > +			if (real_dev->xfrmdev_ops-
> > > > >xdo_dev_state_free)
> > > > +				real_dev->xfrmdev_ops-
> > > > >xdo_dev_state_free(ipsec->xs);
> > > 
> > > Have you checked if .xdo_dev_state_free can sleep?
> > > I see at least one that can: mlx5e_xfrm_free_state().
> > 
> > Hmm, This brings us back to the initial problem. We tried to avoid
> > calling
> > a spin lock in a sleep context (bond_ipsec_del_sa), but now the new
> > code
> > encounters this issue again.
> 
> The reason the mutex was added (instead of the spinlock used before)
> was exactly because the add and free offload operations could sleep.
> 
> > With your reply, I also checked the xdo_dev_state_add() in
> > bond_ipsec_add_sa_all(), which may also sleep, e.g.
> > mlx5e_xfrm_add_state(),
> > 
> > If we unlock the spin lock, then the race came back again.
> > 
> > Any idea about this?
> 
> The race is between bond_ipsec_del_sa_all and bond_ipsec_del_sa (plus
> bond_ipsec_free_sa). The issue is that when bond_ipsec_del_sa_all
> releases x->lock, bond_ipsec_del_sa can immediately be called, followed
> by bond_ipsec_free_sa.
> Maybe dropping x->lock after setting real_dev to NULL? I checked,
> real_dev is not used anywhere on the free calls, I think. I have
> another series refactoring things around real_dev, I hope to be able to
> send it soon.
> 
> Here's a sketch of this idea:
> 
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding
> *bond)
>  
>         mutex_lock(&bond->ipsec_lock);
>         list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> -               if (!ipsec->xs->xso.real_dev)
> +               spin_lock(&ipsec->x->lock);
> +               if (!ipsec->xs->xso.real_dev) {
> +                       spin_unlock(&ipsec->x->lock);
>                         continue;
> +               }
>  
>                 if (!real_dev->xfrmdev_ops ||
>                     !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
> @@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding
> *bond)
>                         slave_warn(bond_dev, real_dev,
>                                    "%s: no slave
> xdo_dev_state_delete\n",
>                                    __func__);
> -               } else {
> -                       real_dev->xfrmdev_ops-
> >xdo_dev_state_delete(real_dev, ipsec->xs);
> -                       if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> -                               real_dev->xfrmdev_ops-
> >xdo_dev_state_free(ipsec->xs);
> -                       ipsec->xs->xso.real_dev = NULL;
> +                       spin_unlock(&ipsec->x->lock);
> +                       continue;
>                 }
> +
> +               real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev,
> ipsec->xs);
> +               ipsec->xs->xso.real_dev = NULL;

Set xs->xso.real_dev = NULL is a good idea. As we will break
in bond_ipsec_del_sa()/bond_ipsec_free_sa() when there is no
xs->xso.real_dev.

For bond_ipsec_add_sa_all(), I will move the xso.real_dev = real_dev
after .xdo_dev_state_add() in case the following situation.

bond_ipsec_add_sa_all()
spin_unlock(&ipsec->x->lock);
ipsec->xs->xso.real_dev = real_dev;
                                           __xfrm_state_delete x->state = DEAD
                                              - bond_ipsec_del_sa()
                                                - .xdo_dev_state_delete()
.xdo_dev_state_add()

Thanks
Hangbin

> +               /* Unlock before freeing device state, it could sleep.
> */
> +               spin_unlock(&ipsec->x->lock);
> +               if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> +                       real_dev->xfrmdev_ops-
> >xdo_dev_state_free(ipsec->xs);
> 
> Cosmin.
Hangbin Liu March 6, 2025, 10:02 a.m. UTC | #5
On Thu, Mar 06, 2025 at 09:37:53AM +0000, Hangbin Liu wrote:
> > 
> > The reason the mutex was added (instead of the spinlock used before)
> > was exactly because the add and free offload operations could sleep.
> > 
> > > With your reply, I also checked the xdo_dev_state_add() in
> > > bond_ipsec_add_sa_all(), which may also sleep, e.g.
> > > mlx5e_xfrm_add_state(),
> > > 
> > > If we unlock the spin lock, then the race came back again.
> > > 
> > > Any idea about this?
> > 
> > The race is between bond_ipsec_del_sa_all and bond_ipsec_del_sa (plus
> > bond_ipsec_free_sa). The issue is that when bond_ipsec_del_sa_all
> > releases x->lock, bond_ipsec_del_sa can immediately be called, followed
> > by bond_ipsec_free_sa.
> > Maybe dropping x->lock after setting real_dev to NULL? I checked,
> > real_dev is not used anywhere on the free calls, I think. I have
> > another series refactoring things around real_dev, I hope to be able to
> > send it soon.
> > 
> > Here's a sketch of this idea:
> > 
> > --- a/drivers/net/bonding/bond_main.c
> > +++ b/drivers/net/bonding/bond_main.c
> > @@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding
> > *bond)
> >  
> >         mutex_lock(&bond->ipsec_lock);
> >         list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> > -               if (!ipsec->xs->xso.real_dev)
> > +               spin_lock(&ipsec->x->lock);
> > +               if (!ipsec->xs->xso.real_dev) {
> > +                       spin_unlock(&ipsec->x->lock);
> >                         continue;
> > +               }
> >  
> >                 if (!real_dev->xfrmdev_ops ||
> >                     !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
> > @@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding
> > *bond)
> >                         slave_warn(bond_dev, real_dev,
> >                                    "%s: no slave
> > xdo_dev_state_delete\n",
> >                                    __func__);
> > -               } else {
> > -                       real_dev->xfrmdev_ops-
> > >xdo_dev_state_delete(real_dev, ipsec->xs);
> > -                       if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> > -                               real_dev->xfrmdev_ops-
> > >xdo_dev_state_free(ipsec->xs);
> > -                       ipsec->xs->xso.real_dev = NULL;
> > +                       spin_unlock(&ipsec->x->lock);
> > +                       continue;
> >                 }
> > +
> > +               real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev,
> > ipsec->xs);
> > +               ipsec->xs->xso.real_dev = NULL;
> 
> Set xs->xso.real_dev = NULL is a good idea. As we will break
> in bond_ipsec_del_sa()/bond_ipsec_free_sa() when there is no
> xs->xso.real_dev.
> 
> For bond_ipsec_add_sa_all(), I will move the xso.real_dev = real_dev
> after .xdo_dev_state_add() in case the following situation.
> 
> bond_ipsec_add_sa_all()
> spin_unlock(&ipsec->x->lock);
> ipsec->xs->xso.real_dev = real_dev;
>                                            __xfrm_state_delete x->state = DEAD
>                                               - bond_ipsec_del_sa()
>                                                 - .xdo_dev_state_delete()
> .xdo_dev_state_add()


Hmm, do we still need to the spin_lock in bond_ipsec_add_sa_all()? With
xs->xso.real_dev = NULL after bond_ipsec_del_sa_all(), it looks there is
no need the spin_lock in bond_ipsec_add_sa_all(). e.g.


diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 04b677d0c45b..3ada51c63207 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -537,15 +537,27 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
 	}
 
 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		spin_lock_bh(&ipsec->xs->lock);
+		/* Skip dead xfrm states, they'll be freed later. */
+		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
+			spin_unlock_bh(&ipsec->xs->lock);
+			continue;
+		}
+
 		/* If new state is added before ipsec_lock acquired */
-		if (ipsec->xs->xso.real_dev == real_dev)
+		if (ipsec->xs->xso.real_dev == real_dev) {
+			spin_unlock_bh(&ipsec->xs->lock);
 			continue;
+		}
 
-		ipsec->xs->xso.real_dev = real_dev;
 		if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
 			slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
 			ipsec->xs->xso.real_dev = NULL;
 		}
+		/* Set real_dev after .xdo_dev_state_add in case
+		 * __xfrm_state_delete() is called in parallel
+		 */
+		ipsec->xs->xso.real_dev = real_dev;
 	}

The spin_lock here seems useless now. What do you think?

Thanks
Hangbin
Hangbin Liu March 6, 2025, 1:04 p.m. UTC | #6
On Wed, Mar 05, 2025 at 04:12:18PM +0000, Cosmin Ratiu wrote:
> +++ b/drivers/net/bonding/bond_main.c
> @@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding
> *bond)
>  
>         mutex_lock(&bond->ipsec_lock);
>         list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> -               if (!ipsec->xs->xso.real_dev)
> +               spin_lock(&ipsec->x->lock);
> +               if (!ipsec->xs->xso.real_dev) {
> +                       spin_unlock(&ipsec->x->lock);
>                         continue;
> +               }
>  
>                 if (!real_dev->xfrmdev_ops ||
>                     !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
> @@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding
> *bond)
>                         slave_warn(bond_dev, real_dev,
>                                    "%s: no slave
> xdo_dev_state_delete\n",
>                                    __func__);
> -               } else {
> -                       real_dev->xfrmdev_ops-
> >xdo_dev_state_delete(real_dev, ipsec->xs);
> -                       if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> -                               real_dev->xfrmdev_ops-
> >xdo_dev_state_free(ipsec->xs);
> -                       ipsec->xs->xso.real_dev = NULL;
> +                       spin_unlock(&ipsec->x->lock);
> +                       continue;
>                 }
> +
> +               real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev,
> ipsec->xs);
> +               ipsec->xs->xso.real_dev = NULL;
> +               /* Unlock before freeing device state, it could sleep.
> */
> +               spin_unlock(&ipsec->x->lock);
> +               if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> +                       real_dev->xfrmdev_ops-
> >xdo_dev_state_free(ipsec->xs);

BTW, with setting real_dev = NULL here, I think

> To fix that, these entries should be freed here and the WARN_ON in
> bond_ipsec_free_sa() should be converted to an if...goto out, so that
> bond_ipsec_free_sa() calls would hit one of these conditions:
> 1. "if (!slave)", when no active device exists.
> 2. "if (!xs->xso.real_dev)", when xdo_dev_state_add() failed.
> 3. "if (xs->xso.real_dev != real_dev)", when a DEAD xs was already
> freed by bond_ipsec_del_sa_all() migration to a new device.
> In all 3 cases, xdo_dev_state_free() shouldn't be called, only xs
> removed from the bond->ipsec list.

The if (xs->xso.real_dev != real_dev) should never happen again.
As the real_dev = NULL, it will trigger 2 "if (!xs->xso.real_dev)"
directly.

And in bond_ipsec_add_sa_all(), it will set ipsec->xs->xso.real_dev =
real_dev, which the active slave already finished changing.

Thanks
Hangbin
Hangbin Liu March 6, 2025, 1:29 p.m. UTC | #7
On Thu, Mar 06, 2025 at 10:02:34AM +0000, Hangbin Liu wrote:
> > Set xs->xso.real_dev = NULL is a good idea. As we will break
> > in bond_ipsec_del_sa()/bond_ipsec_free_sa() when there is no
> > xs->xso.real_dev.
> > 
> > For bond_ipsec_add_sa_all(), I will move the xso.real_dev = real_dev
> > after .xdo_dev_state_add() in case the following situation.
> > 
> Hmm, do we still need to the spin_lock in bond_ipsec_add_sa_all()? With
> xs->xso.real_dev = NULL after bond_ipsec_del_sa_all(), it looks there is
> no need the spin_lock in bond_ipsec_add_sa_all(). e.g.
> 
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 04b677d0c45b..3ada51c63207 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -537,15 +537,27 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
>  	}
>  
>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> +		spin_lock_bh(&ipsec->xs->lock);
> +		/* Skip dead xfrm states, they'll be freed later. */
> +		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> +			spin_unlock_bh(&ipsec->xs->lock);
> +			continue;
> +		}
> +
>  		/* If new state is added before ipsec_lock acquired */
> -		if (ipsec->xs->xso.real_dev == real_dev)
> +		if (ipsec->xs->xso.real_dev == real_dev) {
> +			spin_unlock_bh(&ipsec->xs->lock);
>  			continue;
> +		}
>  
> -		ipsec->xs->xso.real_dev = real_dev;
>  		if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
>  			slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
>  			ipsec->xs->xso.real_dev = NULL;
>  		}
> +		/* Set real_dev after .xdo_dev_state_add in case
> +		 * __xfrm_state_delete() is called in parallel
> +		 */
> +		ipsec->xs->xso.real_dev = real_dev;
>  	}

OK, please ignore this, the .xdo_dev_state_add() need xso.real_dev to
be set first. Then I'm still wonder how to avoid the race before
.xdo_dev_state_add() is called, e.g.

 bond_ipsec_add_sa_all()
 spin_lock_bh(&ipsec->xs->lock);
 ipsec->xs->xso.real_dev = real_dev;
 spin_unlock(&ipsec->x->lock);
                                            __xfrm_state_delete
                                               - bond_ipsec_del_sa()
                                                 - .xdo_dev_state_delete()
					       - bond_ipsec_free_sa()
					         - .xdo_dev_state_free()
 .xdo_dev_state_add()

Thanks
Hangbin
Cosmin Ratiu March 6, 2025, 1:37 p.m. UTC | #8
On Thu, 2025-03-06 at 10:02 +0000, Hangbin Liu wrote:
> > For bond_ipsec_add_sa_all(), I will move the xso.real_dev =
> > real_dev
> > after .xdo_dev_state_add() in case the following situation.

xso.real_dev needs to be initialized before the call to
xdo_dev_state_add, since many of the implementations look in
xso.real_dev to determine on which device to operate on.
So the ordering should be:
- get the lock
- set xso.real_dev to real_dev
- release the lock
- call xdo_dev_state_add
- if it fails, reacquire the lock and set the device to NULL.

Unfortunately, this doesn't seem to protect against the scenario below,
as after dropping the spinlock from bond_ipsec_add_sa_all,
bond_ipsec_del_sa can freely call xdo_dev_state_delete() on real_dev
before xdo_dev_state_add happens.

I don't know what to do in this case...

> > 
> > bond_ipsec_add_sa_all()
> > spin_unlock(&ipsec->x->lock);
> > ipsec->xs->xso.real_dev = real_dev;
> >                                 __xfrm_state_delete x->state = DEAD
> >                                   - bond_ipsec_del_sa()
> >                                     - .xdo_dev_state_delete()
> > .xdo_dev_state_add()

Cosmin.
Hangbin Liu March 7, 2025, 2:39 a.m. UTC | #9
On Thu, Mar 06, 2025 at 01:37:15PM +0000, Cosmin Ratiu wrote:
> On Thu, 2025-03-06 at 10:02 +0000, Hangbin Liu wrote:
> > > For bond_ipsec_add_sa_all(), I will move the xso.real_dev =
> > > real_dev
> > > after .xdo_dev_state_add() in case the following situation.
> 
> xso.real_dev needs to be initialized before the call to
> xdo_dev_state_add, since many of the implementations look in
> xso.real_dev to determine on which device to operate on.
> So the ordering should be:
> - get the lock
> - set xso.real_dev to real_dev
> - release the lock
> - call xdo_dev_state_add
> - if it fails, reacquire the lock and set the device to NULL.
> 
> Unfortunately, this doesn't seem to protect against the scenario below,
> as after dropping the spinlock from bond_ipsec_add_sa_all,
> bond_ipsec_del_sa can freely call xdo_dev_state_delete() on real_dev
> before xdo_dev_state_add happens.
> 
> I don't know what to do in this case...

Yes, me neither. How about add a note and leave it there until we
have a solution?

Regards
Hangbin
> 
> > > 
> > > bond_ipsec_add_sa_all()
> > > spin_unlock(&ipsec->x->lock);
> > > ipsec->xs->xso.real_dev = real_dev;
> > >                                 __xfrm_state_delete x->state = DEAD
> > >                                   - bond_ipsec_del_sa()
> > >                                     - .xdo_dev_state_delete()
> > > .xdo_dev_state_add()
> 
> Cosmin.
diff mbox series

Patch

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e45bba240cbc..06b060d9b031 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -537,15 +537,22 @@  static void bond_ipsec_add_sa_all(struct bonding *bond)
 	}
 
 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		spin_lock_bh(&ipsec->xs->lock);
+		/* Skip dead xfrm states, they'll be freed later. */
+		if (ipsec->xs->km.state == XFRM_STATE_DEAD)
+			goto next;
+
 		/* If new state is added before ipsec_lock acquired */
 		if (ipsec->xs->xso.real_dev == real_dev)
-			continue;
+			goto next;
 
 		ipsec->xs->xso.real_dev = real_dev;
 		if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
 			slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
 			ipsec->xs->xso.real_dev = NULL;
 		}
+next:
+		spin_unlock_bh(&ipsec->xs->lock);
 	}
 out:
 	mutex_unlock(&bond->ipsec_lock);
@@ -560,7 +567,6 @@  static void bond_ipsec_del_sa(struct xfrm_state *xs)
 	struct net_device *bond_dev = xs->xso.dev;
 	struct net_device *real_dev;
 	netdevice_tracker tracker;
-	struct bond_ipsec *ipsec;
 	struct bonding *bond;
 	struct slave *slave;
 
@@ -592,15 +598,6 @@  static void bond_ipsec_del_sa(struct xfrm_state *xs)
 	real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
 out:
 	netdev_put(real_dev, &tracker);
-	mutex_lock(&bond->ipsec_lock);
-	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
-		if (ipsec->xs == xs) {
-			list_del(&ipsec->list);
-			kfree(ipsec);
-			break;
-		}
-	}
-	mutex_unlock(&bond->ipsec_lock);
 }
 
 static void bond_ipsec_del_sa_all(struct bonding *bond)
@@ -617,8 +614,18 @@  static void bond_ipsec_del_sa_all(struct bonding *bond)
 
 	mutex_lock(&bond->ipsec_lock);
 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		spin_lock_bh(&ipsec->xs->lock);
 		if (!ipsec->xs->xso.real_dev)
-			continue;
+			goto next;
+
+		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
+			/* already dead no need to delete again */
+			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
+				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
+			list_del(&ipsec->list);
+			kfree(ipsec);
+			goto next;
+		}
 
 		if (!real_dev->xfrmdev_ops ||
 		    !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
@@ -631,6 +638,8 @@  static void bond_ipsec_del_sa_all(struct bonding *bond)
 			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
 				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
 		}
+next:
+		spin_unlock_bh(&ipsec->xs->lock);
 	}
 	mutex_unlock(&bond->ipsec_lock);
 }
@@ -640,6 +649,7 @@  static void bond_ipsec_free_sa(struct xfrm_state *xs)
 	struct net_device *bond_dev = xs->xso.dev;
 	struct net_device *real_dev;
 	netdevice_tracker tracker;
+	struct bond_ipsec *ipsec;
 	struct bonding *bond;
 	struct slave *slave;
 
@@ -659,11 +669,22 @@  static void bond_ipsec_free_sa(struct xfrm_state *xs)
 	if (!xs->xso.real_dev)
 		goto out;
 
-	WARN_ON(xs->xso.real_dev != real_dev);
+	mutex_lock(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (ipsec->xs == xs) {
+			/* do xdo_dev_state_free if real_dev matches,
+			 * otherwise only remove the list
+			 */
+			if (real_dev && real_dev->xfrmdev_ops &&
+			    real_dev->xfrmdev_ops->xdo_dev_state_free)
+				real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
+			list_del(&ipsec->list);
+			kfree(ipsec);
+			break;
+		}
+	}
+	mutex_unlock(&bond->ipsec_lock);
 
-	if (real_dev && real_dev->xfrmdev_ops &&
-	    real_dev->xfrmdev_ops->xdo_dev_state_free)
-		real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
 out:
 	netdev_put(real_dev, &tracker);
 }