Message ID | 20250304131120.31135-2-liuhangbin@gmail.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | bond: fix xfrm offload issues | expand |
On 3/4/25 15:11, Hangbin Liu wrote: > The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers > a warning: > > BUG: sleeping function called from invalid context at... > > Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa, > which is not held by spin_lock_bh(). > > Additionally, delete the IPsec list in bond_ipsec_del_sa_all() when the > XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered > again in bond_ipsec_free_sa(). > > For bond_ipsec_free_sa(), there are now three conditions: > > 1. if (!slave): When no active device exists. > 2. if (!xs->xso.real_dev): When xdo_dev_state_add() fails. > 3. if (xs->xso.real_dev != real_dev): When an xs has already been freed > by bond_ipsec_del_sa_all() due to migration, and the active slave has > changed to a new device. At the same time, the xs is marked as DEAD > due to the XFRM entry is removed, triggering xfrm_state_gc_task() and > bond_ipsec_free_sa(). > > In all three cases, xdo_dev_state_free() should not be called, only xs > should be removed from bond->ipsec list. > > At the same time, protect bond_ipsec_del_sa_all and bond_ipsec_add_sa_all > with x->lock for each xs being processed. This prevents XFRM from > concurrently initiating add/delete operations on the managed states. > > Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex") > Reported-by: Jakub Kicinski <kuba@kernel.org> > Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org > Suggested-by: Cosmin Ratiu <cratiu@nvidia.com> > Signed-off-by: Hangbin Liu <liuhangbin@gmail.com> > --- > drivers/net/bonding/bond_main.c | 53 +++++++++++++++++++++++---------- > 1 file changed, 37 insertions(+), 16 deletions(-) > > diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c > index e45bba240cbc..06b060d9b031 100644 > --- a/drivers/net/bonding/bond_main.c > +++ b/drivers/net/bonding/bond_main.c > @@ -537,15 +537,22 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) > } > > list_for_each_entry(ipsec, &bond->ipsec_list, list) { > + spin_lock_bh(&ipsec->xs->lock); > + /* Skip dead xfrm states, they'll be freed later. */ > + if (ipsec->xs->km.state == XFRM_STATE_DEAD) > + goto next; > + > /* If new state is added before ipsec_lock acquired */ > if (ipsec->xs->xso.real_dev == real_dev) > - continue; > + goto next; > > ipsec->xs->xso.real_dev = real_dev; > if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) { > slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); > ipsec->xs->xso.real_dev = NULL; > } > +next: > + spin_unlock_bh(&ipsec->xs->lock); > } > out: > mutex_unlock(&bond->ipsec_lock); > @@ -560,7 +567,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) > struct net_device *bond_dev = xs->xso.dev; > struct net_device *real_dev; > netdevice_tracker tracker; > - struct bond_ipsec *ipsec; > struct bonding *bond; > struct slave *slave; > > @@ -592,15 +598,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) > real_dev->xfrmdev_ops->xdo_dev_state_delete(xs); > out: > netdev_put(real_dev, &tracker); > - mutex_lock(&bond->ipsec_lock); > - list_for_each_entry(ipsec, &bond->ipsec_list, list) { > - if (ipsec->xs == xs) { > - list_del(&ipsec->list); > - kfree(ipsec); > - break; > - } > - } > - mutex_unlock(&bond->ipsec_lock); > } > > static void bond_ipsec_del_sa_all(struct bonding *bond) > @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) > > mutex_lock(&bond->ipsec_lock); > list_for_each_entry(ipsec, &bond->ipsec_list, list) { Second time - you should use list_for_each_entry_safe if you're walking and deleting elements from the list. > + spin_lock_bh(&ipsec->xs->lock); > if (!ipsec->xs->xso.real_dev) > - continue; > + goto next; > + > + if (ipsec->xs->km.state == XFRM_STATE_DEAD) { > + /* already dead no need to delete again */ > + if (real_dev->xfrmdev_ops->xdo_dev_state_free) > + real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); Have you checked if .xdo_dev_state_free can sleep? I see at least one that can: mlx5e_xfrm_free_state(). > + list_del(&ipsec->list); > + kfree(ipsec); > + goto next; > + } > > if (!real_dev->xfrmdev_ops || > !real_dev->xfrmdev_ops->xdo_dev_state_delete || > @@ -631,6 +638,8 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) > if (real_dev->xfrmdev_ops->xdo_dev_state_free) > real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); > } > +next: > + spin_unlock_bh(&ipsec->xs->lock); > } > mutex_unlock(&bond->ipsec_lock); > } > @@ -640,6 +649,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) > struct net_device *bond_dev = xs->xso.dev; > struct net_device *real_dev; > netdevice_tracker tracker; > + struct bond_ipsec *ipsec; > struct bonding *bond; > struct slave *slave; > > @@ -659,11 +669,22 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) > if (!xs->xso.real_dev) > goto out; > > - WARN_ON(xs->xso.real_dev != real_dev); > + mutex_lock(&bond->ipsec_lock); > + list_for_each_entry(ipsec, &bond->ipsec_list, list) { > + if (ipsec->xs == xs) { > + /* do xdo_dev_state_free if real_dev matches, > + * otherwise only remove the list > + */ > + if (real_dev && real_dev->xfrmdev_ops && > + real_dev->xfrmdev_ops->xdo_dev_state_free) > + real_dev->xfrmdev_ops->xdo_dev_state_free(xs); > + list_del(&ipsec->list); > + kfree(ipsec); > + break; > + } > + } > + mutex_unlock(&bond->ipsec_lock); > > - if (real_dev && real_dev->xfrmdev_ops && > - real_dev->xfrmdev_ops->xdo_dev_state_free) > - real_dev->xfrmdev_ops->xdo_dev_state_free(xs); > out: > netdev_put(real_dev, &tracker); > }
On Wed, Mar 05, 2025 at 10:38:36AM +0200, Nikolay Aleksandrov wrote: > > @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) > > > > mutex_lock(&bond->ipsec_lock); > > list_for_each_entry(ipsec, &bond->ipsec_list, list) { > > Second time - you should use list_for_each_entry_safe if you're walking and deleting > elements from the list. Sorry, I missed this comment. I will update in next version. > > > + spin_lock_bh(&ipsec->xs->lock); > > if (!ipsec->xs->xso.real_dev) > > - continue; > > + goto next; > > + > > + if (ipsec->xs->km.state == XFRM_STATE_DEAD) { > > + /* already dead no need to delete again */ > > + if (real_dev->xfrmdev_ops->xdo_dev_state_free) > > + real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); > > Have you checked if .xdo_dev_state_free can sleep? > I see at least one that can: mlx5e_xfrm_free_state(). Hmm, This brings us back to the initial problem. We tried to avoid calling a spin lock in a sleep context (bond_ipsec_del_sa), but now the new code encounters this issue again. With your reply, I also checked the xdo_dev_state_add() in bond_ipsec_add_sa_all(), which may also sleep, e.g. mlx5e_xfrm_add_state(), If we unlock the spin lock, then the race came back again. Any idea about this? thanks Hangbin
On Wed, 2025-03-05 at 14:13 +0000, Hangbin Liu wrote: > On Wed, Mar 05, 2025 at 10:38:36AM +0200, Nikolay Aleksandrov wrote: > > > @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct > > > bonding *bond) > > > > > > mutex_lock(&bond->ipsec_lock); > > > list_for_each_entry(ipsec, &bond->ipsec_list, list) { > > > > Second time - you should use list_for_each_entry_safe if you're > > walking and deleting > > elements from the list. > > Sorry, I missed this comment. I will update in next version. > > > > > > + spin_lock_bh(&ipsec->xs->lock); > > > if (!ipsec->xs->xso.real_dev) > > > - continue; > > > + goto next; > > > + > > > + if (ipsec->xs->km.state == XFRM_STATE_DEAD) { > > > + /* already dead no need to delete again > > > */ > > > + if (real_dev->xfrmdev_ops- > > > >xdo_dev_state_free) > > > + real_dev->xfrmdev_ops- > > > >xdo_dev_state_free(ipsec->xs); > > > > Have you checked if .xdo_dev_state_free can sleep? > > I see at least one that can: mlx5e_xfrm_free_state(). > > Hmm, This brings us back to the initial problem. We tried to avoid > calling > a spin lock in a sleep context (bond_ipsec_del_sa), but now the new > code > encounters this issue again. The reason the mutex was added (instead of the spinlock used before) was exactly because the add and free offload operations could sleep. > With your reply, I also checked the xdo_dev_state_add() in > bond_ipsec_add_sa_all(), which may also sleep, e.g. > mlx5e_xfrm_add_state(), > > If we unlock the spin lock, then the race came back again. > > Any idea about this? The race is between bond_ipsec_del_sa_all and bond_ipsec_del_sa (plus bond_ipsec_free_sa). The issue is that when bond_ipsec_del_sa_all releases x->lock, bond_ipsec_del_sa can immediately be called, followed by bond_ipsec_free_sa. Maybe dropping x->lock after setting real_dev to NULL? I checked, real_dev is not used anywhere on the free calls, I think. I have another series refactoring things around real_dev, I hope to be able to send it soon. Here's a sketch of this idea: --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { - if (!ipsec->xs->xso.real_dev) + spin_lock(&ipsec->x->lock); + if (!ipsec->xs->xso.real_dev) { + spin_unlock(&ipsec->x->lock); continue; + } if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_delete || @@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__); - } else { - real_dev->xfrmdev_ops- >xdo_dev_state_delete(real_dev, ipsec->xs); - if (real_dev->xfrmdev_ops->xdo_dev_state_free) - real_dev->xfrmdev_ops- >xdo_dev_state_free(ipsec->xs); - ipsec->xs->xso.real_dev = NULL; + spin_unlock(&ipsec->x->lock); + continue; } + + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs); + ipsec->xs->xso.real_dev = NULL; + /* Unlock before freeing device state, it could sleep. */ + spin_unlock(&ipsec->x->lock); + if (real_dev->xfrmdev_ops->xdo_dev_state_free) + real_dev->xfrmdev_ops- >xdo_dev_state_free(ipsec->xs); Cosmin.
On Wed, Mar 05, 2025 at 04:12:18PM +0000, Cosmin Ratiu wrote: > On Wed, 2025-03-05 at 14:13 +0000, Hangbin Liu wrote: > > On Wed, Mar 05, 2025 at 10:38:36AM +0200, Nikolay Aleksandrov wrote: > > > > @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct > > > > bonding *bond) > > > > > > > > mutex_lock(&bond->ipsec_lock); > > > > list_for_each_entry(ipsec, &bond->ipsec_list, list) { > > > > > > Second time - you should use list_for_each_entry_safe if you're > > > walking and deleting > > > elements from the list. > > > > Sorry, I missed this comment. I will update in next version. > > > > > > > > > + spin_lock_bh(&ipsec->xs->lock); > > > > if (!ipsec->xs->xso.real_dev) > > > > - continue; > > > > + goto next; > > > > + > > > > + if (ipsec->xs->km.state == XFRM_STATE_DEAD) { > > > > + /* already dead no need to delete again > > > > */ > > > > + if (real_dev->xfrmdev_ops- > > > > >xdo_dev_state_free) > > > > + real_dev->xfrmdev_ops- > > > > >xdo_dev_state_free(ipsec->xs); > > > > > > Have you checked if .xdo_dev_state_free can sleep? > > > I see at least one that can: mlx5e_xfrm_free_state(). > > > > Hmm, This brings us back to the initial problem. We tried to avoid > > calling > > a spin lock in a sleep context (bond_ipsec_del_sa), but now the new > > code > > encounters this issue again. > > The reason the mutex was added (instead of the spinlock used before) > was exactly because the add and free offload operations could sleep. > > > With your reply, I also checked the xdo_dev_state_add() in > > bond_ipsec_add_sa_all(), which may also sleep, e.g. > > mlx5e_xfrm_add_state(), > > > > If we unlock the spin lock, then the race came back again. > > > > Any idea about this? > > The race is between bond_ipsec_del_sa_all and bond_ipsec_del_sa (plus > bond_ipsec_free_sa). The issue is that when bond_ipsec_del_sa_all > releases x->lock, bond_ipsec_del_sa can immediately be called, followed > by bond_ipsec_free_sa. > Maybe dropping x->lock after setting real_dev to NULL? I checked, > real_dev is not used anywhere on the free calls, I think. I have > another series refactoring things around real_dev, I hope to be able to > send it soon. > > Here's a sketch of this idea: > > --- a/drivers/net/bonding/bond_main.c > +++ b/drivers/net/bonding/bond_main.c > @@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding > *bond) > > mutex_lock(&bond->ipsec_lock); > list_for_each_entry(ipsec, &bond->ipsec_list, list) { > - if (!ipsec->xs->xso.real_dev) > + spin_lock(&ipsec->x->lock); > + if (!ipsec->xs->xso.real_dev) { > + spin_unlock(&ipsec->x->lock); > continue; > + } > > if (!real_dev->xfrmdev_ops || > !real_dev->xfrmdev_ops->xdo_dev_state_delete || > @@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding > *bond) > slave_warn(bond_dev, real_dev, > "%s: no slave > xdo_dev_state_delete\n", > __func__); > - } else { > - real_dev->xfrmdev_ops- > >xdo_dev_state_delete(real_dev, ipsec->xs); > - if (real_dev->xfrmdev_ops->xdo_dev_state_free) > - real_dev->xfrmdev_ops- > >xdo_dev_state_free(ipsec->xs); > - ipsec->xs->xso.real_dev = NULL; > + spin_unlock(&ipsec->x->lock); > + continue; > } > + > + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, > ipsec->xs); > + ipsec->xs->xso.real_dev = NULL; Set xs->xso.real_dev = NULL is a good idea. As we will break in bond_ipsec_del_sa()/bond_ipsec_free_sa() when there is no xs->xso.real_dev. For bond_ipsec_add_sa_all(), I will move the xso.real_dev = real_dev after .xdo_dev_state_add() in case the following situation. bond_ipsec_add_sa_all() spin_unlock(&ipsec->x->lock); ipsec->xs->xso.real_dev = real_dev; __xfrm_state_delete x->state = DEAD - bond_ipsec_del_sa() - .xdo_dev_state_delete() .xdo_dev_state_add() Thanks Hangbin > + /* Unlock before freeing device state, it could sleep. > */ > + spin_unlock(&ipsec->x->lock); > + if (real_dev->xfrmdev_ops->xdo_dev_state_free) > + real_dev->xfrmdev_ops- > >xdo_dev_state_free(ipsec->xs); > > Cosmin.
On Thu, Mar 06, 2025 at 09:37:53AM +0000, Hangbin Liu wrote: > > > > The reason the mutex was added (instead of the spinlock used before) > > was exactly because the add and free offload operations could sleep. > > > > > With your reply, I also checked the xdo_dev_state_add() in > > > bond_ipsec_add_sa_all(), which may also sleep, e.g. > > > mlx5e_xfrm_add_state(), > > > > > > If we unlock the spin lock, then the race came back again. > > > > > > Any idea about this? > > > > The race is between bond_ipsec_del_sa_all and bond_ipsec_del_sa (plus > > bond_ipsec_free_sa). The issue is that when bond_ipsec_del_sa_all > > releases x->lock, bond_ipsec_del_sa can immediately be called, followed > > by bond_ipsec_free_sa. > > Maybe dropping x->lock after setting real_dev to NULL? I checked, > > real_dev is not used anywhere on the free calls, I think. I have > > another series refactoring things around real_dev, I hope to be able to > > send it soon. > > > > Here's a sketch of this idea: > > > > --- a/drivers/net/bonding/bond_main.c > > +++ b/drivers/net/bonding/bond_main.c > > @@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding > > *bond) > > > > mutex_lock(&bond->ipsec_lock); > > list_for_each_entry(ipsec, &bond->ipsec_list, list) { > > - if (!ipsec->xs->xso.real_dev) > > + spin_lock(&ipsec->x->lock); > > + if (!ipsec->xs->xso.real_dev) { > > + spin_unlock(&ipsec->x->lock); > > continue; > > + } > > > > if (!real_dev->xfrmdev_ops || > > !real_dev->xfrmdev_ops->xdo_dev_state_delete || > > @@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding > > *bond) > > slave_warn(bond_dev, real_dev, > > "%s: no slave > > xdo_dev_state_delete\n", > > __func__); > > - } else { > > - real_dev->xfrmdev_ops- > > >xdo_dev_state_delete(real_dev, ipsec->xs); > > - if (real_dev->xfrmdev_ops->xdo_dev_state_free) > > - real_dev->xfrmdev_ops- > > >xdo_dev_state_free(ipsec->xs); > > - ipsec->xs->xso.real_dev = NULL; > > + spin_unlock(&ipsec->x->lock); > > + continue; > > } > > + > > + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, > > ipsec->xs); > > + ipsec->xs->xso.real_dev = NULL; > > Set xs->xso.real_dev = NULL is a good idea. As we will break > in bond_ipsec_del_sa()/bond_ipsec_free_sa() when there is no > xs->xso.real_dev. > > For bond_ipsec_add_sa_all(), I will move the xso.real_dev = real_dev > after .xdo_dev_state_add() in case the following situation. > > bond_ipsec_add_sa_all() > spin_unlock(&ipsec->x->lock); > ipsec->xs->xso.real_dev = real_dev; > __xfrm_state_delete x->state = DEAD > - bond_ipsec_del_sa() > - .xdo_dev_state_delete() > .xdo_dev_state_add() Hmm, do we still need to the spin_lock in bond_ipsec_add_sa_all()? With xs->xso.real_dev = NULL after bond_ipsec_del_sa_all(), it looks there is no need the spin_lock in bond_ipsec_add_sa_all(). e.g. diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 04b677d0c45b..3ada51c63207 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -537,15 +537,27 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) } list_for_each_entry(ipsec, &bond->ipsec_list, list) { + spin_lock_bh(&ipsec->xs->lock); + /* Skip dead xfrm states, they'll be freed later. */ + if (ipsec->xs->km.state == XFRM_STATE_DEAD) { + spin_unlock_bh(&ipsec->xs->lock); + continue; + } + /* If new state is added before ipsec_lock acquired */ - if (ipsec->xs->xso.real_dev == real_dev) + if (ipsec->xs->xso.real_dev == real_dev) { + spin_unlock_bh(&ipsec->xs->lock); continue; + } - ipsec->xs->xso.real_dev = real_dev; if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) { slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); ipsec->xs->xso.real_dev = NULL; } + /* Set real_dev after .xdo_dev_state_add in case + * __xfrm_state_delete() is called in parallel + */ + ipsec->xs->xso.real_dev = real_dev; } The spin_lock here seems useless now. What do you think? Thanks Hangbin
On Wed, Mar 05, 2025 at 04:12:18PM +0000, Cosmin Ratiu wrote: > +++ b/drivers/net/bonding/bond_main.c > @@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding > *bond) > > mutex_lock(&bond->ipsec_lock); > list_for_each_entry(ipsec, &bond->ipsec_list, list) { > - if (!ipsec->xs->xso.real_dev) > + spin_lock(&ipsec->x->lock); > + if (!ipsec->xs->xso.real_dev) { > + spin_unlock(&ipsec->x->lock); > continue; > + } > > if (!real_dev->xfrmdev_ops || > !real_dev->xfrmdev_ops->xdo_dev_state_delete || > @@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding > *bond) > slave_warn(bond_dev, real_dev, > "%s: no slave > xdo_dev_state_delete\n", > __func__); > - } else { > - real_dev->xfrmdev_ops- > >xdo_dev_state_delete(real_dev, ipsec->xs); > - if (real_dev->xfrmdev_ops->xdo_dev_state_free) > - real_dev->xfrmdev_ops- > >xdo_dev_state_free(ipsec->xs); > - ipsec->xs->xso.real_dev = NULL; > + spin_unlock(&ipsec->x->lock); > + continue; > } > + > + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, > ipsec->xs); > + ipsec->xs->xso.real_dev = NULL; > + /* Unlock before freeing device state, it could sleep. > */ > + spin_unlock(&ipsec->x->lock); > + if (real_dev->xfrmdev_ops->xdo_dev_state_free) > + real_dev->xfrmdev_ops- > >xdo_dev_state_free(ipsec->xs); BTW, with setting real_dev = NULL here, I think > To fix that, these entries should be freed here and the WARN_ON in > bond_ipsec_free_sa() should be converted to an if...goto out, so that > bond_ipsec_free_sa() calls would hit one of these conditions: > 1. "if (!slave)", when no active device exists. > 2. "if (!xs->xso.real_dev)", when xdo_dev_state_add() failed. > 3. "if (xs->xso.real_dev != real_dev)", when a DEAD xs was already > freed by bond_ipsec_del_sa_all() migration to a new device. > In all 3 cases, xdo_dev_state_free() shouldn't be called, only xs > removed from the bond->ipsec list. The if (xs->xso.real_dev != real_dev) should never happen again. As the real_dev = NULL, it will trigger 2 "if (!xs->xso.real_dev)" directly. And in bond_ipsec_add_sa_all(), it will set ipsec->xs->xso.real_dev = real_dev, which the active slave already finished changing. Thanks Hangbin
On Thu, Mar 06, 2025 at 10:02:34AM +0000, Hangbin Liu wrote: > > Set xs->xso.real_dev = NULL is a good idea. As we will break > > in bond_ipsec_del_sa()/bond_ipsec_free_sa() when there is no > > xs->xso.real_dev. > > > > For bond_ipsec_add_sa_all(), I will move the xso.real_dev = real_dev > > after .xdo_dev_state_add() in case the following situation. > > > Hmm, do we still need to the spin_lock in bond_ipsec_add_sa_all()? With > xs->xso.real_dev = NULL after bond_ipsec_del_sa_all(), it looks there is > no need the spin_lock in bond_ipsec_add_sa_all(). e.g. > > > diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c > index 04b677d0c45b..3ada51c63207 100644 > --- a/drivers/net/bonding/bond_main.c > +++ b/drivers/net/bonding/bond_main.c > @@ -537,15 +537,27 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) > } > > list_for_each_entry(ipsec, &bond->ipsec_list, list) { > + spin_lock_bh(&ipsec->xs->lock); > + /* Skip dead xfrm states, they'll be freed later. */ > + if (ipsec->xs->km.state == XFRM_STATE_DEAD) { > + spin_unlock_bh(&ipsec->xs->lock); > + continue; > + } > + > /* If new state is added before ipsec_lock acquired */ > - if (ipsec->xs->xso.real_dev == real_dev) > + if (ipsec->xs->xso.real_dev == real_dev) { > + spin_unlock_bh(&ipsec->xs->lock); > continue; > + } > > - ipsec->xs->xso.real_dev = real_dev; > if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) { > slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); > ipsec->xs->xso.real_dev = NULL; > } > + /* Set real_dev after .xdo_dev_state_add in case > + * __xfrm_state_delete() is called in parallel > + */ > + ipsec->xs->xso.real_dev = real_dev; > } OK, please ignore this, the .xdo_dev_state_add() need xso.real_dev to be set first. Then I'm still wonder how to avoid the race before .xdo_dev_state_add() is called, e.g. bond_ipsec_add_sa_all() spin_lock_bh(&ipsec->xs->lock); ipsec->xs->xso.real_dev = real_dev; spin_unlock(&ipsec->x->lock); __xfrm_state_delete - bond_ipsec_del_sa() - .xdo_dev_state_delete() - bond_ipsec_free_sa() - .xdo_dev_state_free() .xdo_dev_state_add() Thanks Hangbin
On Thu, 2025-03-06 at 10:02 +0000, Hangbin Liu wrote: > > For bond_ipsec_add_sa_all(), I will move the xso.real_dev = > > real_dev > > after .xdo_dev_state_add() in case the following situation. xso.real_dev needs to be initialized before the call to xdo_dev_state_add, since many of the implementations look in xso.real_dev to determine on which device to operate on. So the ordering should be: - get the lock - set xso.real_dev to real_dev - release the lock - call xdo_dev_state_add - if it fails, reacquire the lock and set the device to NULL. Unfortunately, this doesn't seem to protect against the scenario below, as after dropping the spinlock from bond_ipsec_add_sa_all, bond_ipsec_del_sa can freely call xdo_dev_state_delete() on real_dev before xdo_dev_state_add happens. I don't know what to do in this case... > > > > bond_ipsec_add_sa_all() > > spin_unlock(&ipsec->x->lock); > > ipsec->xs->xso.real_dev = real_dev; > > __xfrm_state_delete x->state = DEAD > > - bond_ipsec_del_sa() > > - .xdo_dev_state_delete() > > .xdo_dev_state_add() Cosmin.
On Thu, Mar 06, 2025 at 01:37:15PM +0000, Cosmin Ratiu wrote: > On Thu, 2025-03-06 at 10:02 +0000, Hangbin Liu wrote: > > > For bond_ipsec_add_sa_all(), I will move the xso.real_dev = > > > real_dev > > > after .xdo_dev_state_add() in case the following situation. > > xso.real_dev needs to be initialized before the call to > xdo_dev_state_add, since many of the implementations look in > xso.real_dev to determine on which device to operate on. > So the ordering should be: > - get the lock > - set xso.real_dev to real_dev > - release the lock > - call xdo_dev_state_add > - if it fails, reacquire the lock and set the device to NULL. > > Unfortunately, this doesn't seem to protect against the scenario below, > as after dropping the spinlock from bond_ipsec_add_sa_all, > bond_ipsec_del_sa can freely call xdo_dev_state_delete() on real_dev > before xdo_dev_state_add happens. > > I don't know what to do in this case... Yes, me neither. How about add a note and leave it there until we have a solution? Regards Hangbin > > > > > > > bond_ipsec_add_sa_all() > > > spin_unlock(&ipsec->x->lock); > > > ipsec->xs->xso.real_dev = real_dev; > > > __xfrm_state_delete x->state = DEAD > > > - bond_ipsec_del_sa() > > > - .xdo_dev_state_delete() > > > .xdo_dev_state_add() > > Cosmin.
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e45bba240cbc..06b060d9b031 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -537,15 +537,22 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) } list_for_each_entry(ipsec, &bond->ipsec_list, list) { + spin_lock_bh(&ipsec->xs->lock); + /* Skip dead xfrm states, they'll be freed later. */ + if (ipsec->xs->km.state == XFRM_STATE_DEAD) + goto next; + /* If new state is added before ipsec_lock acquired */ if (ipsec->xs->xso.real_dev == real_dev) - continue; + goto next; ipsec->xs->xso.real_dev = real_dev; if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) { slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); ipsec->xs->xso.real_dev = NULL; } +next: + spin_unlock_bh(&ipsec->xs->lock); } out: mutex_unlock(&bond->ipsec_lock); @@ -560,7 +567,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker; - struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave; @@ -592,15 +598,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) real_dev->xfrmdev_ops->xdo_dev_state_delete(xs); out: netdev_put(real_dev, &tracker); - mutex_lock(&bond->ipsec_lock); - list_for_each_entry(ipsec, &bond->ipsec_list, list) { - if (ipsec->xs == xs) { - list_del(&ipsec->list); - kfree(ipsec); - break; - } - } - mutex_unlock(&bond->ipsec_lock); } static void bond_ipsec_del_sa_all(struct bonding *bond) @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { + spin_lock_bh(&ipsec->xs->lock); if (!ipsec->xs->xso.real_dev) - continue; + goto next; + + if (ipsec->xs->km.state == XFRM_STATE_DEAD) { + /* already dead no need to delete again */ + if (real_dev->xfrmdev_ops->xdo_dev_state_free) + real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); + list_del(&ipsec->list); + kfree(ipsec); + goto next; + } if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_delete || @@ -631,6 +638,8 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) if (real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); } +next: + spin_unlock_bh(&ipsec->xs->lock); } mutex_unlock(&bond->ipsec_lock); } @@ -640,6 +649,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker; + struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave; @@ -659,11 +669,22 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) if (!xs->xso.real_dev) goto out; - WARN_ON(xs->xso.real_dev != real_dev); + mutex_lock(&bond->ipsec_lock); + list_for_each_entry(ipsec, &bond->ipsec_list, list) { + if (ipsec->xs == xs) { + /* do xdo_dev_state_free if real_dev matches, + * otherwise only remove the list + */ + if (real_dev && real_dev->xfrmdev_ops && + real_dev->xfrmdev_ops->xdo_dev_state_free) + real_dev->xfrmdev_ops->xdo_dev_state_free(xs); + list_del(&ipsec->list); + kfree(ipsec); + break; + } + } + mutex_unlock(&bond->ipsec_lock); - if (real_dev && real_dev->xfrmdev_ops && - real_dev->xfrmdev_ops->xdo_dev_state_free) - real_dev->xfrmdev_ops->xdo_dev_state_free(xs); out: netdev_put(real_dev, &tracker); }
The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers a warning: BUG: sleeping function called from invalid context at... Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa, which is not held by spin_lock_bh(). Additionally, delete the IPsec list in bond_ipsec_del_sa_all() when the XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered again in bond_ipsec_free_sa(). For bond_ipsec_free_sa(), there are now three conditions: 1. if (!slave): When no active device exists. 2. if (!xs->xso.real_dev): When xdo_dev_state_add() fails. 3. if (xs->xso.real_dev != real_dev): When an xs has already been freed by bond_ipsec_del_sa_all() due to migration, and the active slave has changed to a new device. At the same time, the xs is marked as DEAD due to the XFRM entry is removed, triggering xfrm_state_gc_task() and bond_ipsec_free_sa(). In all three cases, xdo_dev_state_free() should not be called, only xs should be removed from bond->ipsec list. At the same time, protect bond_ipsec_del_sa_all and bond_ipsec_add_sa_all with x->lock for each xs being processed. This prevents XFRM from concurrently initiating add/delete operations on the managed states. Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex") Reported-by: Jakub Kicinski <kuba@kernel.org> Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org Suggested-by: Cosmin Ratiu <cratiu@nvidia.com> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com> --- drivers/net/bonding/bond_main.c | 53 +++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 16 deletions(-)