Message ID | 20250411074958.2858496-7-cratiu@nvidia.com (mailing list archive) |
---|---|
State | Awaiting Upstream |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | xfrm & bonding: Correct use of xso.real_dev | expand |
On Fri, Apr 11, 2025 at 10:49:58AM +0300, Cosmin Ratiu wrote: > Refactor the bonding ipsec offload operations to fix a number of > long-standing control plane races between state migration and user > deletion and a few other issues. > > xfrm state deletion can happen concurrently with a > bond_change_active_slave() operation. This manifests itself as a > bond_ipsec_del_sa() call with x->lock held, followed by a > bond_ipsec_free_sa() a bit later from a wq. The alternate path of > these calls coming from xfrm_dev_state_flush() can't happen, as that > needs the RTNL lock and bond_change_active_slave() already holds it. > > 1. bond_ipsec_del_sa_all() might call xdo_dev_state_delete() a second > time on an xfrm state that was concurrently killed. This is bad. > 2. bond_ipsec_add_sa_all() can add a state on the new device, but > pending bond_ipsec_free_sa() calls from the old device will then hit > the WARN_ON() and then, worse, call xdo_dev_state_free() on the new > device without a corresponding xdo_dev_state_delete(). > 3. Resolve a sleeping-in-atomic-context bug introduced by the mentioned > "Fixes" commit. > > bond_ipsec_del_sa_all() and bond_ipsec_add_sa_all() now acquire x->lock > and check for x->km.state to help with problems 1 and 2. And since > xso.real_dev is now a private pointer managed by the bonding driver in > xfrm state, make better use of it to fully fix problems 1 and 2. In > bond_ipsec_del_sa_all(), set xso.real_dev to NULL while holding both the > mutex and x->lock, which makes sure that neither bond_ipsec_del_sa() nor > bond_ipsec_free_sa() could run concurrently. > > Fix problem 3 by moving the list cleanup (which requires the mutex) from > bond_ipsec_del_sa() (called from atomic context) to bond_ipsec_free_sa(). > > Finally, simplify bond_ipsec_del_sa() and bond_ipsec_free_sa() by using > xso->real_dev directly, since it's now protected by locks and can be > trusted to always reflect the offload device. 
> > Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex") > Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com> > Reviewed-by: Leon Romanovsky <leonro@nvidia.com> > --- > drivers/net/bonding/bond_main.c | 82 +++++++++++++++------------------ > include/net/xfrm.h | 7 ++- > 2 files changed, 41 insertions(+), 48 deletions(-) > > diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c > index 14f7c9712ad4..8ed8c29659a0 100644 > --- a/drivers/net/bonding/bond_main.c > +++ b/drivers/net/bonding/bond_main.c > @@ -545,7 +545,20 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) > slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); > continue; > } > + > + spin_lock_bh(&ipsec->xs->lock); > + /* xs might have been killed by the user during the migration > + * to the new dev, but bond_ipsec_del_sa() should have done > + * nothing, as xso.real_dev is NULL. > + * Delete it from the device we just added it to. The pending > + * bond_ipsec_free_sa() call will do the rest of the cleanup. > + */ > + if (ipsec->xs->km.state == XFRM_STATE_DEAD && > + real_dev->xfrmdev_ops->xdo_dev_state_delete) > + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, > + ipsec->xs); > ipsec->xs->xso.real_dev = real_dev; > + spin_unlock_bh(&ipsec->xs->lock); > } > out: > mutex_unlock(&bond->ipsec_lock); > @@ -560,48 +573,20 @@ static void bond_ipsec_del_sa(struct net_device *bond_dev, > struct xfrm_state *xs) > { > struct net_device *real_dev; > - netdevice_tracker tracker; > - struct bond_ipsec *ipsec; > - struct bonding *bond; > - struct slave *slave; > > - if (!bond_dev) > + if (!bond_dev || !xs->xso.real_dev) > return; > > - rcu_read_lock(); > - bond = netdev_priv(bond_dev); > - slave = rcu_dereference(bond->curr_active_slave); > - real_dev = slave ? 
slave->dev : NULL; > - netdev_hold(real_dev, &tracker, GFP_ATOMIC); > - rcu_read_unlock(); > - > - if (!slave) > - goto out; > - > - if (!xs->xso.real_dev) > - goto out; > - > - WARN_ON(xs->xso.real_dev != real_dev); > + real_dev = xs->xso.real_dev; > > if (!real_dev->xfrmdev_ops || > !real_dev->xfrmdev_ops->xdo_dev_state_delete || > netif_is_bond_master(real_dev)) { > slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__); > - goto out; > + return; > } > > real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, xs); > -out: > - netdev_put(real_dev, &tracker); > - mutex_lock(&bond->ipsec_lock); > - list_for_each_entry(ipsec, &bond->ipsec_list, list) { > - if (ipsec->xs == xs) { > - list_del(&ipsec->list); > - kfree(ipsec); > - break; > - } > - } > - mutex_unlock(&bond->ipsec_lock); > } > > static void bond_ipsec_del_sa_all(struct bonding *bond) > @@ -629,9 +614,15 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) > __func__); > continue; > } > + > + spin_lock_bh(&ipsec->xs->lock); > ipsec->xs->xso.real_dev = NULL; > - real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, > - ipsec->xs); > + /* Don't double delete states killed by the user. */ > + if (ipsec->xs->km.state != XFRM_STATE_DEAD) > + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, > + ipsec->xs); > + spin_unlock_bh(&ipsec->xs->lock); > + > if (real_dev->xfrmdev_ops->xdo_dev_state_free) > real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, > ipsec->xs); > @@ -643,34 +634,33 @@ static void bond_ipsec_free_sa(struct net_device *bond_dev, > struct xfrm_state *xs) > { > struct net_device *real_dev; > - netdevice_tracker tracker; > + struct bond_ipsec *ipsec; > struct bonding *bond; > - struct slave *slave; > > if (!bond_dev) > return; > > - rcu_read_lock(); > bond = netdev_priv(bond_dev); > - slave = rcu_dereference(bond->curr_active_slave); > - real_dev = slave ? 
slave->dev : NULL; > - netdev_hold(real_dev, &tracker, GFP_ATOMIC); > - rcu_read_unlock(); > - > - if (!slave) > - goto out; > > + mutex_lock(&bond->ipsec_lock); > if (!xs->xso.real_dev) > goto out; > > - WARN_ON(xs->xso.real_dev != real_dev); > + real_dev = xs->xso.real_dev; > > xs->xso.real_dev = NULL; > - if (real_dev && real_dev->xfrmdev_ops && > + if (real_dev->xfrmdev_ops && > real_dev->xfrmdev_ops->xdo_dev_state_free) > real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, xs); > out: > - netdev_put(real_dev, &tracker); > + list_for_each_entry(ipsec, &bond->ipsec_list, list) { > + if (ipsec->xs == xs) { > + list_del(&ipsec->list); > + kfree(ipsec); > + break; > + } > + } > + mutex_unlock(&bond->ipsec_lock); > } > > /** > diff --git a/include/net/xfrm.h b/include/net/xfrm.h > index 3d2f6c879311..b7e8f3f49627 100644 > --- a/include/net/xfrm.h > +++ b/include/net/xfrm.h > @@ -154,8 +154,11 @@ struct xfrm_dev_offload { > */ > struct net_device *dev; > netdevice_tracker dev_tracker; > - /* This is a private pointer used by the bonding driver. > - * Device drivers should not use it. > + /* This is a private pointer used by the bonding driver (and eventually > + * should be moved there). Device drivers should not use it. > + * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases, > + * except in the .xdo_dev_state_del() flow, where only xfrm_state.lock > + * is held. > */ > struct net_device *real_dev; > unsigned long offload_handle; > -- > 2.45.0 > Tested-by: Hangbin Liu <liuhangbin@gmail.com> Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 14f7c9712ad4..8ed8c29659a0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -545,7 +545,20 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); continue; } + + spin_lock_bh(&ipsec->xs->lock); + /* xs might have been killed by the user during the migration + * to the new dev, but bond_ipsec_del_sa() should have done + * nothing, as xso.real_dev is NULL. + * Delete it from the device we just added it to. The pending + * bond_ipsec_free_sa() call will do the rest of the cleanup. + */ + if (ipsec->xs->km.state == XFRM_STATE_DEAD && + real_dev->xfrmdev_ops->xdo_dev_state_delete) + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, + ipsec->xs); ipsec->xs->xso.real_dev = real_dev; + spin_unlock_bh(&ipsec->xs->lock); } out: mutex_unlock(&bond->ipsec_lock); @@ -560,48 +573,20 @@ static void bond_ipsec_del_sa(struct net_device *bond_dev, struct xfrm_state *xs) { struct net_device *real_dev; - netdevice_tracker tracker; - struct bond_ipsec *ipsec; - struct bonding *bond; - struct slave *slave; - if (!bond_dev) + if (!bond_dev || !xs->xso.real_dev) return; - rcu_read_lock(); - bond = netdev_priv(bond_dev); - slave = rcu_dereference(bond->curr_active_slave); - real_dev = slave ? 
slave->dev : NULL; - netdev_hold(real_dev, &tracker, GFP_ATOMIC); - rcu_read_unlock(); - - if (!slave) - goto out; - - if (!xs->xso.real_dev) - goto out; - - WARN_ON(xs->xso.real_dev != real_dev); + real_dev = xs->xso.real_dev; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_delete || netif_is_bond_master(real_dev)) { slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__); - goto out; + return; } real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, xs); -out: - netdev_put(real_dev, &tracker); - mutex_lock(&bond->ipsec_lock); - list_for_each_entry(ipsec, &bond->ipsec_list, list) { - if (ipsec->xs == xs) { - list_del(&ipsec->list); - kfree(ipsec); - break; - } - } - mutex_unlock(&bond->ipsec_lock); } static void bond_ipsec_del_sa_all(struct bonding *bond) @@ -629,9 +614,15 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) __func__); continue; } + + spin_lock_bh(&ipsec->xs->lock); ipsec->xs->xso.real_dev = NULL; - real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, - ipsec->xs); + /* Don't double delete states killed by the user. */ + if (ipsec->xs->km.state != XFRM_STATE_DEAD) + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, + ipsec->xs); + spin_unlock_bh(&ipsec->xs->lock); + if (real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, ipsec->xs); @@ -643,34 +634,33 @@ static void bond_ipsec_free_sa(struct net_device *bond_dev, struct xfrm_state *xs) { struct net_device *real_dev; - netdevice_tracker tracker; + struct bond_ipsec *ipsec; struct bonding *bond; - struct slave *slave; if (!bond_dev) return; - rcu_read_lock(); bond = netdev_priv(bond_dev); - slave = rcu_dereference(bond->curr_active_slave); - real_dev = slave ? 
slave->dev : NULL; - netdev_hold(real_dev, &tracker, GFP_ATOMIC); - rcu_read_unlock(); - - if (!slave) - goto out; + mutex_lock(&bond->ipsec_lock); if (!xs->xso.real_dev) goto out; - WARN_ON(xs->xso.real_dev != real_dev); + real_dev = xs->xso.real_dev; xs->xso.real_dev = NULL; - if (real_dev && real_dev->xfrmdev_ops && + if (real_dev->xfrmdev_ops && real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, xs); out: - netdev_put(real_dev, &tracker); + list_for_each_entry(ipsec, &bond->ipsec_list, list) { + if (ipsec->xs == xs) { + list_del(&ipsec->list); + kfree(ipsec); + break; + } + } + mutex_unlock(&bond->ipsec_lock); } /** diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3d2f6c879311..b7e8f3f49627 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -154,8 +154,11 @@ struct xfrm_dev_offload { */ struct net_device *dev; netdevice_tracker dev_tracker; - /* This is a private pointer used by the bonding driver. - * Device drivers should not use it. + /* This is a private pointer used by the bonding driver (and eventually + * should be moved there). Device drivers should not use it. + * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases, + * except in the .xdo_dev_state_del() flow, where only xfrm_state.lock + * is held. */ struct net_device *real_dev; unsigned long offload_handle;