diff mbox series

[rdma-rc,4/9] IB/ipoib: Fix double free of skb in case of multicast traffic in CM mode

Message ID 20200212072635.682689-5-leon@kernel.org (mailing list archive)
State Changes Requested
Headers show
Series Fixes for v5.6 | expand

Commit Message

Leon Romanovsky Feb. 12, 2020, 7:26 a.m. UTC
From: Valentine Fatiev <valentinef@mellanox.com>

While connected mode is set and we have connected and datagram traffic
in parallel, it might end with a double free of a datagram skb.

The current mechanism assumes that the order in the completion queue is
the same as the order of sent packets for all QPs. Order is kept only
within a specific QP; in case of mixed UD and CM traffic we have several
QPs (one UD and a few CMs) in parallel.

The problem:
----------------------------------------------------------

Transmit queue:
-----------------
The UD skb pointer is kept in the queue itself; the CM skb is kept in a
separate queue, and the transmit queue is used as a placeholder to count
the number of transmitted packets.

0   1   2   3   4  5  6  7  8   9  10  11 12 13 .........127
------------------------------------------------------------
NL ud1 UD2 CM1 ud3 cm2 cm3 ud4 cm4 ud5 NL NL NL ...........
--------------------------------------------------------------
    ^                                  ^
   tail                               head

Completion queue(problematic scenario) - order not the same as in
transmit queue

  1  2  3  4  5  6  7  8  9
----------------------------
 ud1 CM1 UD2 ud3 cm2 cm3 ud4 cm4 ud5
----------------------------

1. CM1 'wc' processing
   - skb freed in the separate CM ring.
   - tx_tail of the transmit queue increased although UD2 was not freed.
     Now the driver assumes the UD2 index is already freed and could be
     used for a newly transmitted skb.

0   1   2   3   4  5  6  7  8   9  10  11 12 13 .........127
------------------------------------------------------------
NL NL  UD2 CM1 ud3 cm2 cm3 ud4 cm4 ud5 NL NL NL ...........
--------------------------------------------------------------
        ^   ^                       ^
      (Bad)tail                    head
(Bad - Could be used for new SKB)

In this case (due to heavy load) the UD2 skb pointer could be replaced
by a newly transmitted packet UD_NEW, as the driver assumes it is free.
At this point we will have to process two 'wc's with the same index, but
we have only one pointer to free.
During the second attempt to free the same skb we will get a NULL
pointer exception.

2. UD2 'wc' processing
   - skb freed according to the index we got from the 'wc', but it was
     already overwritten by mistake. So the skb that was actually
     released is the skb of the newly transmitted packet, not the
     original one.

3. UD_NEW 'wc' processing
   - attempt to free an already freed skb. NULL pointer exception.

Fix:
-----------------------------------------------------------------------
The fix is to stop using the UD ring as a placeholder for CM packets;
instead, use an atomic counter to keep the total number of packets sent
and use it to stop/wake the queue.

Fixes: 2c104ea68350 ("IB/ipoib: Get rid of the tx_outstanding variable in all modes")
Signed-off-by: Valentine Fatiev <valentinef@mellanox.com>
Reviewed-by: Erez Shitrit <erezsh@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/ulp/ipoib/ipoib.h      |  1 +
 drivers/infiniband/ulp/ipoib/ipoib_cm.c   | 15 ++++++++-------
 drivers/infiniband/ulp/ipoib/ipoib_ib.c   |  8 ++++++--
 drivers/infiniband/ulp/ipoib/ipoib_main.c |  1 +
 4 files changed, 16 insertions(+), 9 deletions(-)

Comments

Jason Gunthorpe Feb. 13, 2020, 3:37 p.m. UTC | #1
On Wed, Feb 12, 2020 at 09:26:30AM +0200, Leon Romanovsky wrote:

> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> index 2aa3457a30ce..c614cb87d09b 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> @@ -379,6 +379,7 @@ struct ipoib_dev_priv {
>  	struct ipoib_tx_buf *tx_ring;
>  	unsigned int	     tx_head;
>  	unsigned int	     tx_tail;
> +	atomic_t             tx_outstanding;
>  	struct ib_sge	     tx_sge[MAX_SKB_FRAGS + 1];
>  	struct ib_ud_wr      tx_wr;
>  	struct ib_wc	     send_wc[MAX_SEND_CQE];
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> index c59e00a0881f..db6aace83fe5 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> @@ -756,7 +756,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
>  		return;
>  	}
>  
> -	if ((priv->tx_head - priv->tx_tail) == ipoib_sendq_size - 1) {
> +	if (atomic_read(&priv->tx_outstanding) == ipoib_sendq_size - 1) {
>  		ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
>  			  tx->qp->qp_num);
>  		netif_stop_queue(dev);
> @@ -786,7 +786,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
>  	} else {
>  		netif_trans_update(dev);
>  		++tx->tx_head;
> -		++priv->tx_head;
> +		atomic_inc(&priv->tx_outstanding);
>  	}

This use of an atomic is very weird, probably wrong.

Why is it an atomic?  priv->tx_head wasn't an atomic, and every place
touching tx_outstanding was also touching tx_head.

I assume there is some hidden locking here? Or much more stuff is
busted up.

In that case, drop the atomic.

However, if the atomic is needed (where/why?) then something has to
be dealing with the races, and if the write side is fully locked then
an atomic is the wrong choice, use READ_ONCE/WRITE_ONCE instead

Jason
Leon Romanovsky Feb. 13, 2020, 6:10 p.m. UTC | #2
On Thu, Feb 13, 2020 at 11:37:43AM -0400, Jason Gunthorpe wrote:
> On Wed, Feb 12, 2020 at 09:26:30AM +0200, Leon Romanovsky wrote:
>
> > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> > index 2aa3457a30ce..c614cb87d09b 100644
> > --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> > @@ -379,6 +379,7 @@ struct ipoib_dev_priv {
> >  	struct ipoib_tx_buf *tx_ring;
> >  	unsigned int	     tx_head;
> >  	unsigned int	     tx_tail;
> > +	atomic_t             tx_outstanding;
> >  	struct ib_sge	     tx_sge[MAX_SKB_FRAGS + 1];
> >  	struct ib_ud_wr      tx_wr;
> >  	struct ib_wc	     send_wc[MAX_SEND_CQE];
> > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > index c59e00a0881f..db6aace83fe5 100644
> > --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > @@ -756,7 +756,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> >  		return;
> >  	}
> >
> > -	if ((priv->tx_head - priv->tx_tail) == ipoib_sendq_size - 1) {
> > +	if (atomic_read(&priv->tx_outstanding) == ipoib_sendq_size - 1) {
> >  		ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
> >  			  tx->qp->qp_num);
> >  		netif_stop_queue(dev);
> > @@ -786,7 +786,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> >  	} else {
> >  		netif_trans_update(dev);
> >  		++tx->tx_head;
> > -		++priv->tx_head;
> > +		atomic_inc(&priv->tx_outstanding);
> >  	}
>
> This use of an atomic is very weird, probably wrong.
>
> Why is it an atomic?  priv->tx_head wasn't an atomic, and every place
> touching tx_outstanding was also touching tx_head.
>
> I assume there is some hidden locking here? Or much more stuff is
> busted up.
>
> In that case, drop the atomic.
>
> However, if the atomic is needed (where/why?) then something has to
> be dealing with the races, and if the write side is fully locked then
> an atomic is the wrong choice, use READ_ONCE/WRITE_ONCE instead

I thought that atomic_t is appropriate here. Valentin wanted to
implement "window" and he needed to ensure that this window is counted
correctly while it doesn't need to be 100% accurate.

Thanks

>
> Jason
Jason Gunthorpe Feb. 13, 2020, 6:26 p.m. UTC | #3
On Thu, Feb 13, 2020 at 08:10:12PM +0200, Leon Romanovsky wrote:
> On Thu, Feb 13, 2020 at 11:37:43AM -0400, Jason Gunthorpe wrote:
> > On Wed, Feb 12, 2020 at 09:26:30AM +0200, Leon Romanovsky wrote:
> >
> > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > index 2aa3457a30ce..c614cb87d09b 100644
> > > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > @@ -379,6 +379,7 @@ struct ipoib_dev_priv {
> > >  	struct ipoib_tx_buf *tx_ring;
> > >  	unsigned int	     tx_head;
> > >  	unsigned int	     tx_tail;
> > > +	atomic_t             tx_outstanding;
> > >  	struct ib_sge	     tx_sge[MAX_SKB_FRAGS + 1];
> > >  	struct ib_ud_wr      tx_wr;
> > >  	struct ib_wc	     send_wc[MAX_SEND_CQE];
> > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > index c59e00a0881f..db6aace83fe5 100644
> > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > @@ -756,7 +756,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> > >  		return;
> > >  	}
> > >
> > > -	if ((priv->tx_head - priv->tx_tail) == ipoib_sendq_size - 1) {
> > > +	if (atomic_read(&priv->tx_outstanding) == ipoib_sendq_size - 1) {
> > >  		ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
> > >  			  tx->qp->qp_num);
> > >  		netif_stop_queue(dev);
> > > @@ -786,7 +786,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> > >  	} else {
> > >  		netif_trans_update(dev);
> > >  		++tx->tx_head;
> > > -		++priv->tx_head;
> > > +		atomic_inc(&priv->tx_outstanding);
> > >  	}
> >
> > This use of an atomic is very weird, probably wrong.
> >
> > Why is it an atomic?  priv->tx_head wasn't an atomic, and every place
> > touching tx_outstanding was also touching tx_head.
> >
> > I assume there is some hidden locking here? Or much more stuff is
> > busted up.
> >
> > In that case, drop the atomic.
> >
> > However, if the atomic is needed (where/why?) then something has to
> > be dealing with the races, and if the write side is fully locked then
> > an atomic is the wrong choice, use READ_ONCE/WRITE_ONCE instead
> 
> I thought that atomic_t is appropriate here. Valentin wanted to
> implement "window" and he needed to ensure that this window is counted
> correctly while it doesn't need to be 100% accurate.

It does need to be 100% accurate because the stop_queue condition is:

 +	if (atomic_read(&priv->tx_outstanding) == ipoib_sendq_size - 1) {

And presumably the whole problem here is overflowing some sized q
object.

That atomic is only needed if there is concurrency, and where is the
concurrency?

Jason
Leon Romanovsky Feb. 13, 2020, 6:36 p.m. UTC | #4
On Thu, Feb 13, 2020 at 02:26:55PM -0400, Jason Gunthorpe wrote:
> On Thu, Feb 13, 2020 at 08:10:12PM +0200, Leon Romanovsky wrote:
> > On Thu, Feb 13, 2020 at 11:37:43AM -0400, Jason Gunthorpe wrote:
> > > On Wed, Feb 12, 2020 at 09:26:30AM +0200, Leon Romanovsky wrote:
> > >
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > index 2aa3457a30ce..c614cb87d09b 100644
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > @@ -379,6 +379,7 @@ struct ipoib_dev_priv {
> > > >  	struct ipoib_tx_buf *tx_ring;
> > > >  	unsigned int	     tx_head;
> > > >  	unsigned int	     tx_tail;
> > > > +	atomic_t             tx_outstanding;
> > > >  	struct ib_sge	     tx_sge[MAX_SKB_FRAGS + 1];
> > > >  	struct ib_ud_wr      tx_wr;
> > > >  	struct ib_wc	     send_wc[MAX_SEND_CQE];
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > index c59e00a0881f..db6aace83fe5 100644
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > @@ -756,7 +756,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> > > >  		return;
> > > >  	}
> > > >
> > > > -	if ((priv->tx_head - priv->tx_tail) == ipoib_sendq_size - 1) {
> > > > +	if (atomic_read(&priv->tx_outstanding) == ipoib_sendq_size - 1) {
> > > >  		ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
> > > >  			  tx->qp->qp_num);
> > > >  		netif_stop_queue(dev);
> > > > @@ -786,7 +786,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> > > >  	} else {
> > > >  		netif_trans_update(dev);
> > > >  		++tx->tx_head;
> > > > -		++priv->tx_head;
> > > > +		atomic_inc(&priv->tx_outstanding);
> > > >  	}
> > >
> > > This use of an atomic is very weird, probably wrong.
> > >
> > > Why is it an atomic?  priv->tx_head wasn't an atomic, and every place
> > > touching tx_outstanding was also touching tx_head.
> > >
> > > I assume there is some hidden locking here? Or much more stuff is
> > > busted up.
> > >
> > > In that case, drop the atomic.
> > >
> > > However, if the atomic is needed (where/why?) then something has to
> > > be dealing with the races, and if the write side is fully locked then
> > > an atomic is the wrong choice, use READ_ONCE/WRITE_ONCE instead
> >
> > I thought that atomic_t is appropriate here. Valentin wanted to
> > implement "window" and he needed to ensure that this window is counted
> > correctly while it doesn't need to be 100% accurate.
>
> It does need to be 100% accurate because the stop_queue condition is:
>
>  +	if (atomic_read(&priv->tx_outstanding) == ipoib_sendq_size - 1) {

So better to write ">=" instead of "==".

>
> And presumably the whole problem here is overflowing some sized q
> object.
>
> That atomic is only needed if there is concurrency, and where is the
> concurrency?

Depends on if you believe to description in commit message :)

>
> Jason
Jason Gunthorpe Feb. 13, 2020, 7:09 p.m. UTC | #5
On Thu, Feb 13, 2020 at 08:36:22PM +0200, Leon Romanovsky wrote:
> On Thu, Feb 13, 2020 at 02:26:55PM -0400, Jason Gunthorpe wrote:
> > On Thu, Feb 13, 2020 at 08:10:12PM +0200, Leon Romanovsky wrote:
> > > On Thu, Feb 13, 2020 at 11:37:43AM -0400, Jason Gunthorpe wrote:
> > > > On Wed, Feb 12, 2020 at 09:26:30AM +0200, Leon Romanovsky wrote:
> > > >
> > > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > > index 2aa3457a30ce..c614cb87d09b 100644
> > > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > > @@ -379,6 +379,7 @@ struct ipoib_dev_priv {
> > > > >  	struct ipoib_tx_buf *tx_ring;
> > > > >  	unsigned int	     tx_head;
> > > > >  	unsigned int	     tx_tail;
> > > > > +	atomic_t             tx_outstanding;
> > > > >  	struct ib_sge	     tx_sge[MAX_SKB_FRAGS + 1];
> > > > >  	struct ib_ud_wr      tx_wr;
> > > > >  	struct ib_wc	     send_wc[MAX_SEND_CQE];
> > > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > > index c59e00a0881f..db6aace83fe5 100644
> > > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > > @@ -756,7 +756,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> > > > >  		return;
> > > > >  	}
> > > > >
> > > > > -	if ((priv->tx_head - priv->tx_tail) == ipoib_sendq_size - 1) {
> > > > > +	if (atomic_read(&priv->tx_outstanding) == ipoib_sendq_size - 1) {
> > > > >  		ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
> > > > >  			  tx->qp->qp_num);
> > > > >  		netif_stop_queue(dev);
> > > > > @@ -786,7 +786,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> > > > >  	} else {
> > > > >  		netif_trans_update(dev);
> > > > >  		++tx->tx_head;
> > > > > -		++priv->tx_head;
> > > > > +		atomic_inc(&priv->tx_outstanding);
> > > > >  	}
> > > >
> > > > This use of an atomic is very weird, probably wrong.
> > > >
> > > > Why is it an atomic?  priv->tx_head wasn't an atomic, and every place
> > > > touching tx_outstanding was also touching tx_head.
> > > >
> > > > I assume there is some hidden locking here? Or much more stuff is
> > > > busted up.
> > > >
> > > > In that case, drop the atomic.
> > > >
> > > > However, if the atomic is needed (where/why?) then something has to
> > > > be dealing with the races, and if the write side is fully locked then
> > > > an atomic is the wrong choice, use READ_ONCE/WRITE_ONCE instead
> > >
> > > I thought that atomic_t is appropriate here. Valentin wanted to
> > > implement "window" and he needed to ensure that this window is counted
> > > correctly while it doesn't need to be 100% accurate.
> >
> > It does need to be 100% accurate because the stop_queue condition is:
> >
> >  +	if (atomic_read(&priv->tx_outstanding) == ipoib_sendq_size - 1) {
> 
> So better to write ">=" instead of "=".

Then you risk overflowing the send q.

The required scheme appears to be to globally bound the # of packets
outstanding to tx such that the global bound is lower than any of the
(many) QP's sendq, thus there will always be a place to put the
packets.

So it must be exact or at least pessimistic.

This isn't a multiq netdev, so AFAICT, the tx side is single threaded
by netdev, and it looks like the wc side was single threaded by NAPI
as well.

The old code used two variable, each written in a single threaded way,
and then read to generate the state. With the missing READ_ONCE added,
it should look like this:

    if ((priv->tx_head - READ_ONCE(priv->tx_tail)) == ipoib_sendq_size - 1) {

I feel like sticking with the single writer packets posted/completed
scheme is better and clearer than trying to collapse it into a single
atomic.

> > And presumably the whole problem here is overflowing some sized q
> > opbject.
> >
> > That atomic is only needed if there is concurrency, and where is the
> > concurrency?
> 
> Depends on if you believe to description in commit message :)

I do.. It is obviously wrong now that you point to it :) Each QP must
have exclusive control of its head/tail pointers, having a CM QP reach
into the head/tail of the UD QP is certainly wrong.

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 2aa3457a30ce..c614cb87d09b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -379,6 +379,7 @@  struct ipoib_dev_priv {
 	struct ipoib_tx_buf *tx_ring;
 	unsigned int	     tx_head;
 	unsigned int	     tx_tail;
+	atomic_t             tx_outstanding;
 	struct ib_sge	     tx_sge[MAX_SKB_FRAGS + 1];
 	struct ib_ud_wr      tx_wr;
 	struct ib_wc	     send_wc[MAX_SEND_CQE];
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index c59e00a0881f..db6aace83fe5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -756,7 +756,7 @@  void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 		return;
 	}
 
-	if ((priv->tx_head - priv->tx_tail) == ipoib_sendq_size - 1) {
+	if (atomic_read(&priv->tx_outstanding) == ipoib_sendq_size - 1) {
 		ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
 			  tx->qp->qp_num);
 		netif_stop_queue(dev);
@@ -786,7 +786,7 @@  void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 	} else {
 		netif_trans_update(dev);
 		++tx->tx_head;
-		++priv->tx_head;
+		atomic_inc(&priv->tx_outstanding);
 	}
 }
 
@@ -820,11 +820,11 @@  void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	netif_tx_lock(dev);
 
 	++tx->tx_tail;
-	++priv->tx_tail;
+	atomic_dec(&priv->tx_outstanding);
 
 	if (unlikely(netif_queue_stopped(dev) &&
-		     (priv->tx_head - priv->tx_tail) <= ipoib_sendq_size >> 1 &&
-		     test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)))
+	    (atomic_read(&priv->tx_outstanding) <= ipoib_sendq_size >> 1) &&
+	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)))
 		netif_wake_queue(dev);
 
 	if (wc->status != IB_WC_SUCCESS &&
@@ -1232,8 +1232,9 @@  static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
 		dev_kfree_skb_any(tx_req->skb);
 		netif_tx_lock_bh(p->dev);
 		++p->tx_tail;
-		++priv->tx_tail;
-		if (unlikely(priv->tx_head - priv->tx_tail == ipoib_sendq_size >> 1) &&
+		atomic_dec(&priv->tx_outstanding);
+		if (unlikely(atomic_read(&priv->tx_outstanding) <=
+			     ipoib_sendq_size >> 1) &&
 		    netif_queue_stopped(p->dev) &&
 		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 			netif_wake_queue(p->dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index c332b4761816..0652f25e8d0f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -407,9 +407,11 @@  static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	dev_kfree_skb_any(tx_req->skb);
 
 	++priv->tx_tail;
+	atomic_dec(&priv->tx_outstanding);
 
 	if (unlikely(netif_queue_stopped(dev) &&
-		     ((priv->tx_head - priv->tx_tail) <= ipoib_sendq_size >> 1) &&
+		     (atomic_read(&priv->tx_outstanding) <= ipoib_sendq_size >>
+		      1) &&
 		     test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)))
 		netif_wake_queue(dev);
 
@@ -634,7 +636,7 @@  int ipoib_send(struct net_device *dev, struct sk_buff *skb,
 	else
 		priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
 	/* increase the tx_head after send success, but use it for queue state */
-	if (priv->tx_head - priv->tx_tail == ipoib_sendq_size - 1) {
+	if (atomic_read(&priv->tx_outstanding) == ipoib_sendq_size - 1) {
 		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
 		netif_stop_queue(dev);
 	}
@@ -662,6 +664,7 @@  int ipoib_send(struct net_device *dev, struct sk_buff *skb,
 
 		rc = priv->tx_head;
 		++priv->tx_head;
+		atomic_inc(&priv->tx_outstanding);
 	}
 	return rc;
 }
@@ -807,6 +810,7 @@  int ipoib_ib_dev_stop_default(struct net_device *dev)
 				ipoib_dma_unmap_tx(priv, tx_req);
 				dev_kfree_skb_any(tx_req->skb);
 				++priv->tx_tail;
+				atomic_dec(&priv->tx_outstanding);
 			}
 
 			for (i = 0; i < ipoib_recvq_size; ++i) {
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 4a0d3a9e72e1..0d98886cfef8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1706,6 +1706,7 @@  static int ipoib_dev_init_default(struct net_device *dev)
 	}
 
 	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
+	atomic_set(&priv->tx_outstanding, 0);
 
 	if (ipoib_transport_dev_init(dev, priv->ca)) {
 		pr_warn("%s: ipoib_transport_dev_init failed\n",