diff mbox

[v2] IB/ipoib: fix for rare multicast join race condition.

Message ID 20160206135041.11630.77019.stgit@phlsvlogin03.ph.intel.com (mailing list archive)
State Superseded
Headers show

Commit Message

Estrin, Alex Feb. 6, 2016, 1:50 p.m. UTC
A narrow window for a race condition still exists between
the multicast join thread and the *dev_flush workers.
A kernel crash caused by prolonged erratic link state changes
was observed (most likely due to faulty cabling):

[167275.656270] BUG: unable to handle kernel NULL pointer dereference at
0000000000000020
[167275.665973] IP: [<ffffffffa05f8f2e>] ipoib_mcast_join+0xae/0x1d0 [ib_ipoib]
[167275.674443] PGD 0
[167275.677373] Oops: 0000 [#1] SMP
...
[167275.977530] Call Trace:
[167275.982225]  [<ffffffffa05f92f0>] ? ipoib_mcast_free+0x200/0x200 [ib_ipoib]
[167275.992024]  [<ffffffffa05fa1b7>] ipoib_mcast_join_task+0x2a7/0x490
[ib_ipoib]
[167276.002149]  [<ffffffff8109d5fb>] process_one_work+0x17b/0x470
[167276.010754]  [<ffffffff8109e3cb>] worker_thread+0x11b/0x400
[167276.019088]  [<ffffffff8109e2b0>] ? rescuer_thread+0x400/0x400
[167276.027737]  [<ffffffff810a5aef>] kthread+0xcf/0xe0
Here was a hit spot:
ipoib_mcast_join() {
..............
      rec.qkey      = priv->broadcast->mcmember.qkey;
                                       ^^^^^^^
.....
 }
The proposed patch should prevent the multicast join task from continuing
if a link state change is detected.

Signed-off-by: Alex Estrin <alex.estrin@intel.com>
---

Changes from v1:
No need to lock again if error detected.
---
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   14 ++++++++++++--
 1 files changed, 12 insertions(+), 2 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Leon Romanovsky Feb. 6, 2016, 5:18 p.m. UTC | #1
A number of very minor comments:
1. Please remove dot at the end of commit message title.

On Sat, Feb 06, 2016 at 08:50:41AM -0500, Alex Estrin wrote:
> A narrow window for race condition still exist between
> multicast join thread and *dev_flush workers.
> A kernel crash caused by prolong erratic link state changes
> was observed (most likely a faulty cabling):
> 
> [167275.656270] BUG: unable to handle kernel NULL pointer dereference at
> 0000000000000020
> [167275.665973] IP: [<ffffffffa05f8f2e>] ipoib_mcast_join+0xae/0x1d0 [ib_ipoib]
> [167275.674443] PGD 0
> [167275.677373] Oops: 0000 [#1] SMP
> ...
> [167275.977530] Call Trace:
> [167275.982225]  [<ffffffffa05f92f0>] ? ipoib_mcast_free+0x200/0x200 [ib_ipoib]
> [167275.992024]  [<ffffffffa05fa1b7>] ipoib_mcast_join_task+0x2a7/0x490
> [ib_ipoib]
> [167276.002149]  [<ffffffff8109d5fb>] process_one_work+0x17b/0x470
> [167276.010754]  [<ffffffff8109e3cb>] worker_thread+0x11b/0x400
> [167276.019088]  [<ffffffff8109e2b0>] ? rescuer_thread+0x400/0x400
> [167276.027737]  [<ffffffff810a5aef>] kthread+0xcf/0xe0
> Here was a hit spot:
> ipoib_mcast_join() {
> ..............
>       rec.qkey      = priv->broadcast->mcmember.qkey;
>                                        ^^^^^^^
> .....
>  }
> Proposed patch should prevent multicast join task to continue
> if link state change is detected.
> 
> Signed-off-by: Alex Estrin <alex.estrin@intel.com>
> ---
> 
> Changes from v1:
> No need to lock again if error detected.
> ---
>  drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   14 ++++++++++++--
>  1 files changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> index 050dfa1..3ce2f0a 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> @@ -456,7 +456,7 @@ out_locked:
>  	return status;
>  }
>  
> -static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
> +static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
>  	struct ib_sa_multicast *multicast;
> @@ -466,6 +466,9 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>  	ib_sa_comp_mask comp_mask;
>  	int ret = 0;
>  
> +	if (!priv->broadcast)
> +		return -EINVAL;
> +
>  	ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw);
>  
>  	rec.mgid     = mcast->mcmember.mgid;
> @@ -539,6 +542,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>  		spin_unlock_irq(&priv->lock);
>  		complete(&mcast->done);
>  	}
> +	return 0;

2. This function declared as a void, it is an error to add "return 0".

>  }
>  
>  void ipoib_mcast_join_task(struct work_struct *work)
> @@ -611,6 +615,11 @@ void ipoib_mcast_join_task(struct work_struct *work)
>  	 * and attached
>  	 */
>  	list_for_each_entry(mcast, &priv->multicast_list, list) {
> +		if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
> +			mcast = NULL;
> +			delay_until = 0;
> +			goto out;
> +		}
>  		if (IS_ERR_OR_NULL(mcast->mc) &&
>  		    !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
>  		    (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ||
> @@ -621,7 +630,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
>  				init_completion(&mcast->done);
>  				set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
>  				spin_unlock_irq(&priv->lock);
> -				ipoib_mcast_join(dev, mcast);
> +				if (ipoib_mcast_join(dev, mcast) != 0)

3. There is no need to compare with 0. Just use "if (ipoib...(...))"
construction.

> +					return;
>  				spin_lock_irq(&priv->lock);
>  			} else if (!delay_until ||
>  				 time_before(mcast->delay_until, delay_until))
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Estrin, Alex Feb. 8, 2016, 12:23 p.m. UTC | #2
Hi Leon,

> A number of very minor comments:
> 1. Please remove dot at the end of commit message title.
> 
Ok. Thanks.

> 2. This function declared as a void, it is an error to add "return 0".
> 
ipoib_mcast_join() is declared as int.  Please see a very first change.

> -static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
> +static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)

> 3. There is no need to compare with 0. Just use "if (ipoib...(...))"
> construction.
Ok. No problem.

Thanks,
Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leon Romanovsky Feb. 8, 2016, 1:22 p.m. UTC | #3
On Mon, Feb 08, 2016 at 12:23:07PM +0000, Estrin, Alex wrote:
> > 2. This function declared as a void, it is an error to add "return 0".
> > 
> ipoib_mcast_join() is declared as int.  Please see a very first change.
> 
> > -static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
> > +static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
> 

You are right.
Thanks
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Erez Shitrit Feb. 8, 2016, 1:34 p.m. UTC | #4
On Sat, Feb 6, 2016 at 3:50 PM, Alex Estrin <alex.estrin@intel.com> wrote:
> A narrow window for race condition still exist between
> multicast join thread and *dev_flush workers.
> A kernel crash caused by prolong erratic link state changes
> was observed (most likely a faulty cabling):
>
> [167275.656270] BUG: unable to handle kernel NULL pointer dereference at
> 0000000000000020
> [167275.665973] IP: [<ffffffffa05f8f2e>] ipoib_mcast_join+0xae/0x1d0 [ib_ipoib]
> [167275.674443] PGD 0
> [167275.677373] Oops: 0000 [#1] SMP
> ...
> [167275.977530] Call Trace:
> [167275.982225]  [<ffffffffa05f92f0>] ? ipoib_mcast_free+0x200/0x200 [ib_ipoib]
> [167275.992024]  [<ffffffffa05fa1b7>] ipoib_mcast_join_task+0x2a7/0x490
> [ib_ipoib]
> [167276.002149]  [<ffffffff8109d5fb>] process_one_work+0x17b/0x470
> [167276.010754]  [<ffffffff8109e3cb>] worker_thread+0x11b/0x400
> [167276.019088]  [<ffffffff8109e2b0>] ? rescuer_thread+0x400/0x400
> [167276.027737]  [<ffffffff810a5aef>] kthread+0xcf/0xe0
> Here was a hit spot:
> ipoib_mcast_join() {
> ..............
>       rec.qkey      = priv->broadcast->mcmember.qkey;
>                                        ^^^^^^^
> .....
>  }
> Proposed patch should prevent multicast join task to continue
> if link state change is detected.
>
> Signed-off-by: Alex Estrin <alex.estrin@intel.com>
> ---
>
> Changes from v1:
> No need to lock again if error detected.
> ---
>  drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   14 ++++++++++++--
>  1 files changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> index 050dfa1..3ce2f0a 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> @@ -456,7 +456,7 @@ out_locked:
>         return status;
>  }
>
> -static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
> +static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>  {
>         struct ipoib_dev_priv *priv = netdev_priv(dev);
>         struct ib_sa_multicast *multicast;
> @@ -466,6 +466,9 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>         ib_sa_comp_mask comp_mask;
>         int ret = 0;
>
> +       if (!priv->broadcast)
> +               return -EINVAL;
> +

Can you please elaborate what will avoid the other task that
invalidate the priv->broadcast (ipoib_mcast_dev_flush) to do it right
after that check?

>         ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw);
>
>         rec.mgid     = mcast->mcmember.mgid;
> @@ -539,6 +542,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>                 spin_unlock_irq(&priv->lock);
>                 complete(&mcast->done);
>         }
> +       return 0;
>  }
>
>  void ipoib_mcast_join_task(struct work_struct *work)
> @@ -611,6 +615,11 @@ void ipoib_mcast_join_task(struct work_struct *work)
>          * and attached
>          */
>         list_for_each_entry(mcast, &priv->multicast_list, list) {
> +               if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
> +                       mcast = NULL;
> +                       delay_until = 0;
> +                       goto out;
> +               }
>                 if (IS_ERR_OR_NULL(mcast->mc) &&
>                     !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
>                     (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ||
> @@ -621,7 +630,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
>                                 init_completion(&mcast->done);
>                                 set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
>                                 spin_unlock_irq(&priv->lock);
> -                               ipoib_mcast_join(dev, mcast);
> +                               if (ipoib_mcast_join(dev, mcast) != 0)
> +                                       return;
>                                 spin_lock_irq(&priv->lock);
>                         } else if (!delay_until ||
>                                  time_before(mcast->delay_until, delay_until))
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Estrin, Alex Feb. 8, 2016, 4:43 p.m. UTC | #5
Hi Erez,

> Can you please elaborate what will avoid the other task that
> invalidate the priv->broadcast (ipoib_mcast_dev_flush) to do it right
> after that check?

I was considering check for IPOIB_FLAG_OPER_UP in mcast task loop
would be sufficient as its state is serialized  by priv->lock:
> >         list_for_each_entry(mcast, &priv->multicast_list, list) {
> > +               if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {

And I can see your point now. We unlock before calling mcast_join().
Apparently check for OPER_UP flag should be added along priv->broadcast check
as it will ensure indication if mcast worker is competing with event handler worker.

Thanks,
Alex.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leon Romanovsky Feb. 9, 2016, 7:18 a.m. UTC | #6
On Mon, Feb 08, 2016 at 04:43:29PM +0000, Estrin, Alex wrote:
> Hi Erez,
>  
> > Can you please elaborate what will avoid the other task that
> > invalidate the priv->broadcast (ipoib_mcast_dev_flush) to do it right
> > after that check?
> 
> I was considering check for IPOIB_FLAG_OPER_UP in mcast task loop 
> would be sufficient as its state is serialized  by priv->lock:
> > >         list_for_each_entry(mcast, &priv->multicast_list, list) {
> > > +               if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
> 
> And I can see your point now. We unlock before calling mcast_join().
> Apparently check for OPER_UP flag should be added along priv->broadcast check
> as it will ensure indication if mcast worker is competing with event handler worker.

Will you plan to respin it?

> 
> Thanks,
> Alex.
> 
> N?????r??y????b?X???v?^?)?{.n?+????{????{ay????,j??f???h???z??w??????j:+v???w?j?m????????zZ+??????j"??!
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Estrin, Alex Feb. 9, 2016, 12:06 p.m. UTC | #7
Hi Leon,

> Will you plan to respin it?


Yes, I'll resubmit shortly.

Thanks,
Alex.
diff mbox

Patch

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 050dfa1..3ce2f0a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -456,7 +456,7 @@  out_locked:
 	return status;
 }
 
-static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
+static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_sa_multicast *multicast;
@@ -466,6 +466,9 @@  static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 	ib_sa_comp_mask comp_mask;
 	int ret = 0;
 
+	if (!priv->broadcast)
+		return -EINVAL;
+
 	ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw);
 
 	rec.mgid     = mcast->mcmember.mgid;
@@ -539,6 +542,7 @@  static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 		spin_unlock_irq(&priv->lock);
 		complete(&mcast->done);
 	}
+	return 0;
 }
 
 void ipoib_mcast_join_task(struct work_struct *work)
@@ -611,6 +615,11 @@  void ipoib_mcast_join_task(struct work_struct *work)
 	 * and attached
 	 */
 	list_for_each_entry(mcast, &priv->multicast_list, list) {
+		if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+			mcast = NULL;
+			delay_until = 0;
+			goto out;
+		}
 		if (IS_ERR_OR_NULL(mcast->mc) &&
 		    !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
 		    (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ||
@@ -621,7 +630,8 @@  void ipoib_mcast_join_task(struct work_struct *work)
 				init_completion(&mcast->done);
 				set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
 				spin_unlock_irq(&priv->lock);
-				ipoib_mcast_join(dev, mcast);
+				if (ipoib_mcast_join(dev, mcast) != 0)
+					return;
 				spin_lock_irq(&priv->lock);
 			} else if (!delay_until ||
 				 time_before(mcast->delay_until, delay_until))