[v2,rdma-next,1/2] RDMA/qedr: Fix synchronization methods and memory leaks in qedr
diff mbox series

Message ID 20191016114242.10736-2-michal.kalderon@marvell.com
State Superseded
Delegated to: Jason Gunthorpe
Headers show
Series
  • RDMA/qedr: Fix memory leaks and synchronization
Related show

Commit Message

Michal Kalderon Oct. 16, 2019, 11:42 a.m. UTC
Re-design of the iWARP CM related objects' reference
counting and synchronization methods, to ensure operations
are synchronized correctly and that memory allocated for "ep"
is released (it was previously leaked).
Also makes sure QP memory is not released before ep is finished
accessing it.

Whereas the QP object is created/destroyed by external operations,
the ep is created/destroyed by internal operations and represents
the TCP connection associated with the QP.

QP destruction flow:
- needs to wait for ep establishment to complete (either successfully
  or with error)
- needs to wait for ep disconnect to be fully posted to avoid a
  race condition of disconnect being called after reset.
- both the operations above don't always happen, so we use atomic
  flags to indicate whether the qp destruction flow needs to wait
  for these completions or not, if the destroy is called before
  these operations began, the flows will check the flags and not
  execute them (connect / disconnect).

We use completion structure for waiting for the completions mentioned
above.

The QP refcnt was modified to kref object.
The EP has a kref added to it to handle additional worker thread
accessing it.

Memory Leaks - https://www.spinics.net/lists/linux-rdma/msg83762.html
Reported-by: Chuck Lever <chuck.lever@oracle.com>

Concurrency not managed correctly -
https://www.spinics.net/lists/linux-rdma/msg67949.html
Reported-by: Jason Gunthorpe <jgg@ziepe.ca>

Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
---
 drivers/infiniband/hw/qedr/qedr.h       |  23 ++++-
 drivers/infiniband/hw/qedr/qedr_iw_cm.c | 150 ++++++++++++++++++++++----------
 drivers/infiniband/hw/qedr/verbs.c      |  42 +++++----
 3 files changed, 141 insertions(+), 74 deletions(-)

Comments

Jason Gunthorpe Oct. 22, 2019, 7:36 p.m. UTC | #1
On Wed, Oct 16, 2019 at 02:42:41PM +0300, Michal Kalderon wrote:
> Re-design of the iWARP CM related objects reference
> counting and synchronization methods, to ensure operations
> are synchronized correctly and and that memory allocated for "ep"
> is released. (was leaked).
> Also makes sure QP memory is not released before ep is finished
> accessing it.
> 
> Where as the QP object is created/destroyed by external operations,
> the ep is created/destroyed by internal operations and represents
> the tcp connection associated with the QP.
> 
> QP destruction flow:
> - needs to wait for ep establishment to complete (either successfully
>   or with error)
> - needs to wait for ep disconnect to be fully posted to avoid a
>   race condition of disconnect being called after reset.
> - both the operations above don't always happen, so we use atomic
>   flags to indicate whether the qp destruction flow needs to wait
>   for these completions or not, if the destroy is called before
>   these operations began, the flows will check the flags and not
>   execute them ( connect / disconnect).
> 
> We use completion structure for waiting for the completions mentioned
> above.
> 
> The QP refcnt was modified to kref object.
> The EP has a kref added to it to handle additional worker thread
> accessing it.
> 
> Memory Leaks - https://www.spinics.net/lists/linux-rdma/msg83762.html
> Reported-by Chuck Lever <chuck.lever@oracle.com>
> 
> Concurrency not managed correctly -
> https://www.spinics.net/lists/linux-rdma/msg67949.html
> Reported-by Jason Gunthorpe <jgg@ziepe.ca>
> 
> Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
> Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
>  drivers/infiniband/hw/qedr/qedr.h       |  23 ++++-
>  drivers/infiniband/hw/qedr/qedr_iw_cm.c | 150 ++++++++++++++++++++++----------
>  drivers/infiniband/hw/qedr/verbs.c      |  42 +++++----
>  3 files changed, 141 insertions(+), 74 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h
> index 0cfd849b13d6..8e927f6c1520 100644
> +++ b/drivers/infiniband/hw/qedr/qedr.h
> @@ -40,6 +40,7 @@
>  #include <linux/qed/qed_rdma_if.h>
>  #include <linux/qed/qede_rdma.h>
>  #include <linux/qed/roce_common.h>
> +#include <linux/completion.h>
>  #include "qedr_hsi_rdma.h"
>  
>  #define QEDR_NODE_DESC "QLogic 579xx RoCE HCA"
> @@ -377,10 +378,20 @@ enum qedr_qp_err_bitmap {
>  	QEDR_QP_ERR_RQ_PBL_FULL = 32,
>  };
>  
> +enum qedr_qp_create_type {
> +	QEDR_QP_CREATE_NONE,
> +	QEDR_QP_CREATE_USER,
> +	QEDR_QP_CREATE_KERNEL,
> +};
> +
> +enum qedr_iwarp_cm_flags {
> +	QEDR_IWARP_CM_WAIT_FOR_CONNECT    = BIT(0),
> +	QEDR_IWARP_CM_WAIT_FOR_DISCONNECT = BIT(1),
> +};
> +
>  struct qedr_qp {
>  	struct ib_qp ibqp;	/* must be first */
>  	struct qedr_dev *dev;
> -	struct qedr_iw_ep *ep;
>  	struct qedr_qp_hwq_info sq;
>  	struct qedr_qp_hwq_info rq;
>  
> @@ -395,6 +406,7 @@ struct qedr_qp {
>  	u32 id;
>  	struct qedr_pd *pd;
>  	enum ib_qp_type qp_type;
> +	enum qedr_qp_create_type create_type;
>  	struct qed_rdma_qp *qed_qp;
>  	u32 qp_id;
>  	u16 icid;
> @@ -437,8 +449,11 @@ struct qedr_qp {
>  	/* Relevant to qps created from user space only (applications) */
>  	struct qedr_userq usq;
>  	struct qedr_userq urq;
> -	atomic_t refcnt;
> -	bool destroyed;
> +
> +	/* synchronization objects used with iwarp ep */
> +	struct kref refcnt;
> +	struct completion iwarp_cm_comp;
> +	unsigned long iwarp_cm_flags; /* enum iwarp_cm_flags */
>  };
>  
>  struct qedr_ah {
> @@ -531,7 +546,7 @@ struct qedr_iw_ep {
>  	struct iw_cm_id	*cm_id;
>  	struct qedr_qp	*qp;
>  	void		*qed_context;
> -	u8		during_connect;
> +	struct kref	refcnt;
>  };
>  
>  static inline
> diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
> index 22881d4442b9..26204caf0975 100644
> +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
> @@ -79,6 +79,28 @@ qedr_fill_sockaddr6(const struct qed_iwarp_cm_info *cm_info,
>  	}
>  }
>  
> +static void qedr_iw_free_qp(struct kref *ref)
> +{
> +	struct qedr_qp *qp = container_of(ref, struct qedr_qp, refcnt);
> +
> +	xa_erase(&qp->dev->qps, qp->qp_id);

This probably doesn't work right because qp_id is not derived from the
xa_array, but some external entity.

Thus the qp_id should be removed from the xarray and kref put'd right
before the qp_id is deallocated from the external manager.

Ie you want to avoid a race where the qp_id can be re-assigned but the
xarray entry hasn't been freed by its kref yet. Then it would
xa_insert and fail.

Also the xa_insert is probably not supposed to be the _irq version
either.

> @@ -224,13 +249,18 @@ qedr_iw_disconnect_event(void *context,
>  	struct qedr_discon_work *work;
>  	struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context;
>  	struct qedr_dev *dev = ep->dev;
> -	struct qedr_qp *qp = ep->qp;
>  
>  	work = kzalloc(sizeof(*work), GFP_ATOMIC);
>  	if (!work)
>  		return;
>  
> -	qedr_iw_qp_add_ref(&qp->ibqp);
> +	/* We can't get a close event before disconnect, but since
> +	 * we're scheduling a work queue we need to make sure close
> +	 * won't delete the ep, so we increase the refcnt
> +	 */
> +	if (!kref_get_unless_zero(&ep->refcnt))
> +		return;

The unless_zero version should not be used without some kind of
locking like this, if you have a pointer to ep then ep must be a valid
pointer and it is safe to take a kref on it.

If there is a risk it is not valid then this is racy in a way that
only locking can fix, not unless_zero

> @@ -476,6 +508,19 @@ qedr_addr6_resolve(struct qedr_dev *dev,
>  	return rc;
>  }
>  
> +struct qedr_qp *qedr_iw_load_qp(struct qedr_dev *dev, u32 qpn)
> +{
> +	struct qedr_qp *qp;
> +
> +	xa_lock(&dev->qps);
> +	qp = xa_load(&dev->qps, qpn);
> +	if (!qp || !kref_get_unless_zero(&qp->refcnt))
> +		qp = NULL;

See, here it is OK because the qp can't be freed under the xa_lock.

However, this unless_zero also will not be needed once the xa_erase is
moved to the right spot.

> +		/* Wait for the connection setup to complete */
> +		if (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_CONNECT,
> +				     &qp->iwarp_cm_flags))
> +			wait_for_completion(&qp->iwarp_cm_comp);
> +
> +		if (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_DISCONNECT,
> +				     &qp->iwarp_cm_flags))
> +			wait_for_completion(&qp->iwarp_cm_comp);
>  	}

These atomics seem mis-named, and I'm unclear how they can both be
waiting on the same completion?

> @@ -2490,11 +2488,11 @@ int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
>  
>  	qedr_free_qp_resources(dev, qp, udata);
>  
> -	if (atomic_dec_and_test(&qp->refcnt) &&
> -	    rdma_protocol_iwarp(&dev->ibdev, 1)) {
> -		xa_erase_irq(&dev->qps, qp->qp_id);

This is probably too late, it should be done before the qp_id could be
recycled.

Jason
Michal Kalderon Oct. 23, 2019, 9:49 a.m. UTC | #2
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Tuesday, October 22, 2019 10:36 PM
> 
> External Email
> 
> ----------------------------------------------------------------------
> On Wed, Oct 16, 2019 at 02:42:41PM +0300, Michal Kalderon wrote:
> 
> > Re-design of the iWARP CM related objects reference
> 
> > counting and synchronization methods, to ensure operations
> 
> > are synchronized correctly and and that memory allocated for "ep"
> 
> > is released. (was leaked).
> 
> > Also makes sure QP memory is not released before ep is finished
> 
> > accessing it.
> 
> >
> 
> > Where as the QP object is created/destroyed by external operations,
> 
> > the ep is created/destroyed by internal operations and represents
> 
> > the tcp connection associated with the QP.
> 
> >
> 
> > QP destruction flow:
> 
> > - needs to wait for ep establishment to complete (either successfully
> 
> >   or with error)
> 
> > - needs to wait for ep disconnect to be fully posted to avoid a
> 
> >   race condition of disconnect being called after reset.
> 
> > - both the operations above don't always happen, so we use atomic
> 
> >   flags to indicate whether the qp destruction flow needs to wait
> 
> >   for these completions or not, if the destroy is called before
> 
> >   these operations began, the flows will check the flags and not
> 
> >   execute them ( connect / disconnect).
> 
> >
> 
> > We use completion structure for waiting for the completions mentioned
> 
> > above.
> 
> >
> 
> > The QP refcnt was modified to kref object.
> 
> > The EP has a kref added to it to handle additional worker thread
> 
> > accessing it.
> 
> >
> 
> > Memory Leaks - https://urldefense.proofpoint.com/v2/url?u=https-
> 3A__www.spinics.net_lists_linux-
> 2Drdma_msg83762.html&d=DwIBAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=7Yun
> NpwaTtA-c31OjGDRlmf54csBCXY_j41vn-0xRz4&m=DfexSyvBpCQbEUyzV-
> q_oGHiJYpLrH4Igtg963UsuZs&s=XNxYMW-
> rrECcE1vRoUReaYZcb01cMCx9X9fs_clAU1Y&e=
> 
> > Reported-by Chuck Lever <chuck.lever@oracle.com>
> 
> >
> 
> > Concurrency not managed correctly -
> 
> > https://urldefense.proofpoint.com/v2/url?u=https-
> 3A__www.spinics.net_lists_linux-
> 2Drdma_msg67949.html&d=DwIBAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=7Yun
> NpwaTtA-c31OjGDRlmf54csBCXY_j41vn-0xRz4&m=DfexSyvBpCQbEUyzV-
> q_oGHiJYpLrH4Igtg963UsuZs&s=zgMS6HLPvQCHcCwmmXuA8EqpZV1cX_DL-
> oPqtLJv2vs&e=
> 
> > Reported-by Jason Gunthorpe <jgg@ziepe.ca>
> 
> >
> 
> > Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
> 
> > Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
> 
> >  drivers/infiniband/hw/qedr/qedr.h       |  23 ++++-
> 
> >  drivers/infiniband/hw/qedr/qedr_iw_cm.c | 150
> ++++++++++++++++++++++----------
> 
> >  drivers/infiniband/hw/qedr/verbs.c      |  42 +++++----
> 
> >  3 files changed, 141 insertions(+), 74 deletions(-)
> 
> >
> 
> > diff --git a/drivers/infiniband/hw/qedr/qedr.h
> b/drivers/infiniband/hw/qedr/qedr.h
> 
> > index 0cfd849b13d6..8e927f6c1520 100644
> 
> > +++ b/drivers/infiniband/hw/qedr/qedr.h
> 
> > @@ -40,6 +40,7 @@
> 
> >  #include <linux/qed/qed_rdma_if.h>
> 
> >  #include <linux/qed/qede_rdma.h>
> 
> >  #include <linux/qed/roce_common.h>
> 
> > +#include <linux/completion.h>
> 
> >  #include "qedr_hsi_rdma.h"
> 
> >
> 
> >  #define QEDR_NODE_DESC "QLogic 579xx RoCE HCA"
> 
> > @@ -377,10 +378,20 @@ enum qedr_qp_err_bitmap {
> 
> >  	QEDR_QP_ERR_RQ_PBL_FULL = 32,
> 
> >  };
> 
> >
> 
> > +enum qedr_qp_create_type {
> 
> > +	QEDR_QP_CREATE_NONE,
> 
> > +	QEDR_QP_CREATE_USER,
> 
> > +	QEDR_QP_CREATE_KERNEL,
> 
> > +};
> 
> > +
> 
> > +enum qedr_iwarp_cm_flags {
> 
> > +	QEDR_IWARP_CM_WAIT_FOR_CONNECT    = BIT(0),
> 
> > +	QEDR_IWARP_CM_WAIT_FOR_DISCONNECT = BIT(1),
> 
> > +};
> 
> > +
> 
> >  struct qedr_qp {
> 
> >  	struct ib_qp ibqp;	/* must be first */
> 
> >  	struct qedr_dev *dev;
> 
> > -	struct qedr_iw_ep *ep;
> 
> >  	struct qedr_qp_hwq_info sq;
> 
> >  	struct qedr_qp_hwq_info rq;
> 
> >
> 
> > @@ -395,6 +406,7 @@ struct qedr_qp {
> 
> >  	u32 id;
> 
> >  	struct qedr_pd *pd;
> 
> >  	enum ib_qp_type qp_type;
> 
> > +	enum qedr_qp_create_type create_type;
> 
> >  	struct qed_rdma_qp *qed_qp;
> 
> >  	u32 qp_id;
> 
> >  	u16 icid;
> 
> > @@ -437,8 +449,11 @@ struct qedr_qp {
> 
> >  	/* Relevant to qps created from user space only (applications) */
> 
> >  	struct qedr_userq usq;
> 
> >  	struct qedr_userq urq;
> 
> > -	atomic_t refcnt;
> 
> > -	bool destroyed;
> 
> > +
> 
> > +	/* synchronization objects used with iwarp ep */
> 
> > +	struct kref refcnt;
> 
> > +	struct completion iwarp_cm_comp;
> 
> > +	unsigned long iwarp_cm_flags; /* enum iwarp_cm_flags */
> 
> >  };
> 
> >
> 
> >  struct qedr_ah {
> 
> > @@ -531,7 +546,7 @@ struct qedr_iw_ep {
> 
> >  	struct iw_cm_id	*cm_id;
> 
> >  	struct qedr_qp	*qp;
> 
> >  	void		*qed_context;
> 
> > -	u8		during_connect;
> 
> > +	struct kref	refcnt;
> 
> >  };
> 
> >
> 
> >  static inline
> 
> > diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c
> b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
> 
> > index 22881d4442b9..26204caf0975 100644
> 
> > +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
> 
> > @@ -79,6 +79,28 @@ qedr_fill_sockaddr6(const struct qed_iwarp_cm_info
> *cm_info,
> 
> >  	}
> 
> >  }
> 
> >
> 
> > +static void qedr_iw_free_qp(struct kref *ref)
> 
> > +{
> 
> > +	struct qedr_qp *qp = container_of(ref, struct qedr_qp, refcnt);
> 
> > +
> 
> > +	xa_erase(&qp->dev->qps, qp->qp_id);
> 
> 
> 
> This probably doesn't work right because qp_id is not derived from the
> 
> xa_array, but some external entity.
> 
> 
> 
> Thus the qp_id should be removed from the xarray and kref put'd right
> 
> before the qp_id is deallocated from the external manager.
> 
Thanks! This is a good point.
I will remove the element from the xarray and kref_put before deallocation, as you
mentioned, but will wait for the kref-free function to be called (perhaps using a
completion) before deallocation.

> 
> 
> Ie you want to avoid a race where the qp_id can be re-assigned but the
> 
> xarray entry hasn't been freed by its kref yet. Then it would
> 
> xa_insert and fail.
> 
> 
> 
> Also the xa_insert is probably not supposed to be the _irq version
Right, this was originally an IDR which was modified by the massive patchset
that changed all IDRs to xarrays. I'll re-review it.

> 
> either.
> 
> 
> 
> > @@ -224,13 +249,18 @@ qedr_iw_disconnect_event(void *context,
> 
> >  	struct qedr_discon_work *work;
> 
> >  	struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context;
> 
> >  	struct qedr_dev *dev = ep->dev;
> 
> > -	struct qedr_qp *qp = ep->qp;
> 
> >
> 
> >  	work = kzalloc(sizeof(*work), GFP_ATOMIC);
> 
> >  	if (!work)
> 
> >  		return;
> 
> >
> 
> > -	qedr_iw_qp_add_ref(&qp->ibqp);
> 
> > +	/* We can't get a close event before disconnect, but since
> 
> > +	 * we're scheduling a work queue we need to make sure close
> 
> > +	 * won't delete the ep, so we increase the refcnt
> 
> > +	 */
> 
> > +	if (!kref_get_unless_zero(&ep->refcnt))
> 
> > +		return;
> 
> 
> 
> The unless_zero version should not be used without some kind of
> 
> locking like this, if you have a pointer to ep then ep must be a valid
> 
> pointer and it is safe to take a kref on it.
> 
> 
> 
> If there is a risk it is not valid then this is racy in a way that
> 
> only locking can fix, not unless_zero
Ok, ep is valid here, I'll modify to kref_get. Thanks.

> 
> 
> 
> > @@ -476,6 +508,19 @@ qedr_addr6_resolve(struct qedr_dev *dev,
> 
> >  	return rc;
> 
> >  }
> 
> >
> 
> > +struct qedr_qp *qedr_iw_load_qp(struct qedr_dev *dev, u32 qpn)
> 
> > +{
> 
> > +	struct qedr_qp *qp;
> 
> > +
> 
> > +	xa_lock(&dev->qps);
> 
> > +	qp = xa_load(&dev->qps, qpn);
> 
> > +	if (!qp || !kref_get_unless_zero(&qp->refcnt))
> 
> > +		qp = NULL;
> 
> 
> 
> See, here is is OK because qp can't be freed under the xa_lock.
> 
> 
> 
> However, this unless_zero also will not be needed once the xa_erase is
> 
> moved to the right spot.
Right. Thanks.

> 
> 
> 
> > +		/* Wait for the connection setup to complete */
> 
> > +		if
> (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_CONNECT,
> 
> > +				     &qp->iwarp_cm_flags))
> 
> > +			wait_for_completion(&qp->iwarp_cm_comp);
> 
> > +
> 
> > +		if
> (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_DISCONNECT,
> 
> > +				     &qp->iwarp_cm_flags))
> 
> > +			wait_for_completion(&qp->iwarp_cm_comp);
> 
> >  	}
> 
> 
> 
> These atomics seem mis-named, and I'm unclear how they can both be
> 
> waiting on the same completion?

In IWARP the connection establishment and disconnect are offloaded to hw and asynchronous. 
The first waits for CONNECT to complete. (completes asynchronously) 
If we are in the middle of a connect that was offloaded (or after connect) the bit will be on and completion will be completed
once the connection is fully established.
We want to wait before destroying the qp due to hw constraints (can't destroy during connect).

The second waits for DISCONNECT to complete (doesn't always occur; only if a graceful disconnect was initiated on either side).
Disconnect can't occur on a connection not established yet, so we can't get the completion of the disconnect instead. 
Similar for connect, once we start the disconnect we turn on the bit and will complete once the disconnect completes. 

I didn't see a reason to use another completion structure, since from what I read complete does comp->done++ and wait_for_completion
checks done and eventually does done-- so it can be used several times if there is no need to distinguish which event occurred first
(in this case there is only one option first connect then disconnect and disconnect can't occur without connect)

But if this is wrong I will add another completion structure.

Thanks,
Michal
> 
> 
> 
> > @@ -2490,11 +2488,11 @@ int qedr_destroy_qp(struct ib_qp *ibqp, struct
> ib_udata *udata)
> 
> >
> 
> >  	qedr_free_qp_resources(dev, qp, udata);
> 
> >
> 
> > -	if (atomic_dec_and_test(&qp->refcnt) &&
> 
> > -	    rdma_protocol_iwarp(&dev->ibdev, 1)) {
> 
> > -		xa_erase_irq(&dev->qps, qp->qp_id);
> 
> 
> 
> This is probably too late, it should be done before the qp_id could be
> 
> recycled.
> 
> 
> 
> Jason
Jason Gunthorpe Oct. 23, 2019, 2:43 p.m. UTC | #3
On Wed, Oct 23, 2019 at 09:49:14AM +0000, Michal Kalderon wrote:
> > 
> > > +		/* Wait for the connection setup to complete */
> > 
> > > +		if
> > (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_CONNECT,
> > 
> > > +				     &qp->iwarp_cm_flags))
> > 
> > > +			wait_for_completion(&qp->iwarp_cm_comp);
> > 
> > > +
> > 
> > > +		if
> > (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_DISCONNECT,
> > 
> > > +				     &qp->iwarp_cm_flags))
> > 
> > > +			wait_for_completion(&qp->iwarp_cm_comp);
> > 
> > >  	}
> > 
> > 
> > 
> > These atomics seem mis-named, and I'm unclear how they can both be
> > 
> > waiting on the same completion?
> 
> In IWARP the connection establishment and disconnect are offloaded to hw and asynchronous. 
> The first waits for CONNECT to complete. (completes asynchronously) 
> If we are in the middle of a connect that was offloaded (or after connect) the bit will be on and completion will be completed
> once the connection is fully established.
> We want to wait before destroying the qp due to hw constraints (can't destroy during connect).

The atomics seem to be used in a 'something in progress' kind of way
as the first thing to get the 'set' does whatever is happening here or
waits for the otherone to finish doing it.

> I didn't see a reason to use another completion structure, since
> from what I read complete does comp->done++ and wait_for_completion
> checks done and eventually does done-- so it can be used several
> times if there is no need to distinguish which event occurred first
> (in this case there is only one option first connect then disconnect
> and disconnect can't occur without connect)

I suppose it is OK here because both completion waits are sequential,
but generally it is kind of sketchy as when
QEDR_IWARP_CM_WAIT_FOR_CONNECT completes it will wake up something
waiting for QEDR_IWARP_CM_WAIT_FOR_DISCONNECT as well.

Jason
Michal Kalderon Oct. 24, 2019, 12:19 p.m. UTC | #4
> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> owner@vger.kernel.org> On Behalf Of Michal Kalderon
> 
> > From: Jason Gunthorpe <jgg@ziepe.ca>
> > Sent: Tuesday, October 22, 2019 10:36 PM
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > On Wed, Oct 16, 2019 at 02:42:41PM +0300, Michal Kalderon wrote:
> >
> > > Re-design of the iWARP CM related objects reference
> >
> > > counting and synchronization methods, to ensure operations
> >
> > > are synchronized correctly and and that memory allocated for "ep"
> >
> > > is released. (was leaked).
> >
> > > Also makes sure QP memory is not released before ep is finished
> >
> > > accessing it.
> >
> > >
> >
> > > Where as the QP object is created/destroyed by external operations,
> >
> > > the ep is created/destroyed by internal operations and represents
> >
> > > the tcp connection associated with the QP.
> >
> > >
> >
> > > QP destruction flow:
> >
> > > - needs to wait for ep establishment to complete (either
> > > successfully
> >
> > >   or with error)
> >
> > > - needs to wait for ep disconnect to be fully posted to avoid a
> >
> > >   race condition of disconnect being called after reset.
> >
> > > - both the operations above don't always happen, so we use atomic
> >
> > >   flags to indicate whether the qp destruction flow needs to wait
> >
> > >   for these completions or not, if the destroy is called before
> >
> > >   these operations began, the flows will check the flags and not
> >
> > >   execute them ( connect / disconnect).
> >
> > >
> >
> > > We use completion structure for waiting for the completions
> > > mentioned
> >
> > > above.
> >
> > >
> >
> > > The QP refcnt was modified to kref object.
> >
> > > The EP has a kref added to it to handle additional worker thread
> >
> > > accessing it.
> >
> > >
> >
> > > Memory Leaks - https://urldefense.proofpoint.com/v2/url?u=https-
> > 3A__www.spinics.net_lists_linux-
> >
> 2Drdma_msg83762.html&d=DwIBAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=7Yun
> > NpwaTtA-c31OjGDRlmf54csBCXY_j41vn-0xRz4&m=DfexSyvBpCQbEUyzV-
> > q_oGHiJYpLrH4Igtg963UsuZs&s=XNxYMW-
> > rrECcE1vRoUReaYZcb01cMCx9X9fs_clAU1Y&e=
> >
> > > Reported-by Chuck Lever <chuck.lever@oracle.com>
> >
> > >
> >
> > > Concurrency not managed correctly -
> >
> > > https://urldefense.proofpoint.com/v2/url?u=https-
> > 3A__www.spinics.net_lists_linux-
> >
> 2Drdma_msg67949.html&d=DwIBAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=7Yun
> > NpwaTtA-c31OjGDRlmf54csBCXY_j41vn-0xRz4&m=DfexSyvBpCQbEUyzV-
> >
> q_oGHiJYpLrH4Igtg963UsuZs&s=zgMS6HLPvQCHcCwmmXuA8EqpZV1cX_DL-
> > oPqtLJv2vs&e=
> >
> > > Reported-by Jason Gunthorpe <jgg@ziepe.ca>
> >
> > >
> >
> > > Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
> >
> > > Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
> >
> > >  drivers/infiniband/hw/qedr/qedr.h       |  23 ++++-
> >
> > >  drivers/infiniband/hw/qedr/qedr_iw_cm.c | 150
> > ++++++++++++++++++++++----------
> >
> > >  drivers/infiniband/hw/qedr/verbs.c      |  42 +++++----
> >
> > >  3 files changed, 141 insertions(+), 74 deletions(-)
> >
> > >
> >
> > > diff --git a/drivers/infiniband/hw/qedr/qedr.h
> > b/drivers/infiniband/hw/qedr/qedr.h
> >
> > > index 0cfd849b13d6..8e927f6c1520 100644
> >
> > > +++ b/drivers/infiniband/hw/qedr/qedr.h
> >
> > > @@ -40,6 +40,7 @@
> >
> > >  #include <linux/qed/qed_rdma_if.h>
> >
> > >  #include <linux/qed/qede_rdma.h>
> >
> > >  #include <linux/qed/roce_common.h>
> >
> > > +#include <linux/completion.h>
> >
> > >  #include "qedr_hsi_rdma.h"
> >
> > >
> >
> > >  #define QEDR_NODE_DESC "QLogic 579xx RoCE HCA"
> >
> > > @@ -377,10 +378,20 @@ enum qedr_qp_err_bitmap {
> >
> > >  	QEDR_QP_ERR_RQ_PBL_FULL = 32,
> >
> > >  };
> >
> > >
> >
> > > +enum qedr_qp_create_type {
> >
> > > +	QEDR_QP_CREATE_NONE,
> >
> > > +	QEDR_QP_CREATE_USER,
> >
> > > +	QEDR_QP_CREATE_KERNEL,
> >
> > > +};
> >
> > > +
> >
> > > +enum qedr_iwarp_cm_flags {
> >
> > > +	QEDR_IWARP_CM_WAIT_FOR_CONNECT    = BIT(0),
> >
> > > +	QEDR_IWARP_CM_WAIT_FOR_DISCONNECT = BIT(1),
> >
> > > +};
> >
> > > +
> >
> > >  struct qedr_qp {
> >
> > >  	struct ib_qp ibqp;	/* must be first */
> >
> > >  	struct qedr_dev *dev;
> >
> > > -	struct qedr_iw_ep *ep;
> >
> > >  	struct qedr_qp_hwq_info sq;
> >
> > >  	struct qedr_qp_hwq_info rq;
> >
> > >
> >
> > > @@ -395,6 +406,7 @@ struct qedr_qp {
> >
> > >  	u32 id;
> >
> > >  	struct qedr_pd *pd;
> >
> > >  	enum ib_qp_type qp_type;
> >
> > > +	enum qedr_qp_create_type create_type;
> >
> > >  	struct qed_rdma_qp *qed_qp;
> >
> > >  	u32 qp_id;
> >
> > >  	u16 icid;
> >
> > > @@ -437,8 +449,11 @@ struct qedr_qp {
> >
> > >  	/* Relevant to qps created from user space only (applications) */
> >
> > >  	struct qedr_userq usq;
> >
> > >  	struct qedr_userq urq;
> >
> > > -	atomic_t refcnt;
> >
> > > -	bool destroyed;
> >
> > > +
> >
> > > +	/* synchronization objects used with iwarp ep */
> >
> > > +	struct kref refcnt;
> >
> > > +	struct completion iwarp_cm_comp;
> >
> > > +	unsigned long iwarp_cm_flags; /* enum iwarp_cm_flags */
> >
> > >  };
> >
> > >
> >
> > >  struct qedr_ah {
> >
> > > @@ -531,7 +546,7 @@ struct qedr_iw_ep {
> >
> > >  	struct iw_cm_id	*cm_id;
> >
> > >  	struct qedr_qp	*qp;
> >
> > >  	void		*qed_context;
> >
> > > -	u8		during_connect;
> >
> > > +	struct kref	refcnt;
> >
> > >  };
> >
> > >
> >
> > >  static inline
> >
> > > diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c
> > b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
> >
> > > index 22881d4442b9..26204caf0975 100644
> >
> > > +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
> >
> > > @@ -79,6 +79,28 @@ qedr_fill_sockaddr6(const struct
> > > qed_iwarp_cm_info
> > *cm_info,
> >
> > >  	}
> >
> > >  }
> >
> > >
> >
> > > +static void qedr_iw_free_qp(struct kref *ref)
> >
> > > +{
> >
> > > +	struct qedr_qp *qp = container_of(ref, struct qedr_qp, refcnt);
> >
> > > +
> >
> > > +	xa_erase(&qp->dev->qps, qp->qp_id);
> >
> >
> >
> > This probably doesn't work right because qp_id is not derived from the
> >
> > xa_array, but some external entity.
> >
> >
> >
> > Thus the qp_id should be removed from the xarray and kref put'd right
> >
> > before the qp_id is deallocated from the external manager.
> >
> Thanks! This is a good point.
> I will remove the element from xarray and kref put before deallocation, as
> you mentioned, But will wait for the kref-free function to be called ( perhaps
> using completion ) Before deallocation.
Hi Jason, 

Actually I was wrong about this one. I can't wait for the kref-free function to be called,
as it requires the deallocation flow to complete for the ep->qp reference to decrease.
I think I can simply call xa_erase before the deallocation flow, not as part of the
kref release function, and call kref_put for the qp object at the end of the flow, after
everything is freed, so that the qp can be freed.

> 
> >
> >
> > Ie you want to avoid a race where the qp_id can be re-assigned but the
> >
> > xarray entry hasn't been freed by its kref yet. Then it would
> >
> > xa_insert and fail.
> >
> >
> >
> > Also the xa_insert is probably not supposed to be the _irq version
> Right, this was originally an idr which was modified with the massive patchset
> That changed all idrs to xarrays. I'll re-review it.
> 
> >
> > either.
> >
> >
> >
> > > @@ -224,13 +249,18 @@ qedr_iw_disconnect_event(void *context,
> >
> > >  	struct qedr_discon_work *work;
> >
> > >  	struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context;
> >
> > >  	struct qedr_dev *dev = ep->dev;
> >
> > > -	struct qedr_qp *qp = ep->qp;
> >
> > >
> >
> > >  	work = kzalloc(sizeof(*work), GFP_ATOMIC);
> >
> > >  	if (!work)
> >
> > >  		return;
> >
> > >
> >
> > > -	qedr_iw_qp_add_ref(&qp->ibqp);
> >
> > > +	/* We can't get a close event before disconnect, but since
> >
> > > +	 * we're scheduling a work queue we need to make sure close
> >
> > > +	 * won't delete the ep, so we increase the refcnt
> >
> > > +	 */
> >
> > > +	if (!kref_get_unless_zero(&ep->refcnt))
> >
> > > +		return;
> >
> >
> >
> > The unless_zero version should not be used without some kind of
> >
> > locking like this, if you have a pointer to ep then ep must be a valid
> >
> > pointer and it is safe to take a kref on it.
> >
> >
> >
> > If there is a risk it is not valid then this is racy in a way that
> >
> > only locking can fix, not unless_zero
> Ok, ep is valid here, I'll modify to kref_get. Thanks.
> 
> >
> >
> >
> > > @@ -476,6 +508,19 @@ qedr_addr6_resolve(struct qedr_dev *dev,
> >
> > >  	return rc;
> >
> > >  }
> >
> > >
> >
> > > +struct qedr_qp *qedr_iw_load_qp(struct qedr_dev *dev, u32 qpn)
> >
> > > +{
> >
> > > +	struct qedr_qp *qp;
> >
> > > +
> >
> > > +	xa_lock(&dev->qps);
> >
> > > +	qp = xa_load(&dev->qps, qpn);
> >
> > > +	if (!qp || !kref_get_unless_zero(&qp->refcnt))
> >
> > > +		qp = NULL;
> >
> >
> >
> > See, here is is OK because qp can't be freed under the xa_lock.
> >
> >
> >
> > However, this unless_zero also will not be needed once the xa_erase is
> >
> > moved to the right spot.
> Right. Thanks.
> 
> >
> >
> >
> > > +		/* Wait for the connection setup to complete */
> >
> > > +		if
> > (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_CONNECT,
> >
> > > +				     &qp->iwarp_cm_flags))
> >
> > > +			wait_for_completion(&qp->iwarp_cm_comp);
> >
> > > +
> >
> > > +		if
> > (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_DISCONNECT,
> >
> > > +				     &qp->iwarp_cm_flags))
> >
> > > +			wait_for_completion(&qp->iwarp_cm_comp);
> >
> > >  	}
> >
> >
> >
> > These atomics seem mis-named, and I'm unclear how they can both be
> >
> > waiting on the same completion?
> 
> In IWARP the connection establishment and disconnect are offloaded to hw
> and asynchronous.
> The first waits for CONNECT to complete. (completes asynchronously) If we
> are in the middle of a connect that was offloaded (or after connect) the bit
> will be on and completion will be completed once the connection is fully
> established.
> We want to wait before destroying the qp due to hw constraints (can't
> destroy during connect).
> 
> The seconds waits for DISCONNECT to complete ( doesn't always occur, only
> if a graceful disconnect was initiated on either side) Disconnect can't occur on
> a connection not established yet, so we can't get the completion of the
> disconnect instead.
> Similar for connect, once we start the disconnect we turn on the bit and will
> complete once the disconnect completes.
> 
> I didn't see a reason to use another completion structure, since from what I
> read complete does comp->done++ and wait_for_completion checks done
> and eventually does done-- so it can be used several times if there is no need
> to distinguish which event occurred first (in this case there is only one option
> first connect then disconnect and disconnect can't occur without connect)
> 
> But if this is wrong I will add another completion structure.
> 
> Thanks,
> Michal
> >
> >
> >
> > > @@ -2490,11 +2488,11 @@ int qedr_destroy_qp(struct ib_qp *ibqp,
> > > struct
> > ib_udata *udata)
> >
> > >
> >
> > >  	qedr_free_qp_resources(dev, qp, udata);
> >
> > >
> >
> > > -	if (atomic_dec_and_test(&qp->refcnt) &&
> >
> > > -	    rdma_protocol_iwarp(&dev->ibdev, 1)) {
> >
> > > -		xa_erase_irq(&dev->qps, qp->qp_id);
> >
> >
> >
> > This is probably too late, it should be done before the qp_id could be
> >
> > recycled.
> >
> >
> >
> > Jason

Patch
diff mbox series

diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h
index 0cfd849b13d6..8e927f6c1520 100644
--- a/drivers/infiniband/hw/qedr/qedr.h
+++ b/drivers/infiniband/hw/qedr/qedr.h
@@ -40,6 +40,7 @@ 
 #include <linux/qed/qed_rdma_if.h>
 #include <linux/qed/qede_rdma.h>
 #include <linux/qed/roce_common.h>
+#include <linux/completion.h>
 #include "qedr_hsi_rdma.h"
 
 #define QEDR_NODE_DESC "QLogic 579xx RoCE HCA"
@@ -377,10 +378,20 @@  enum qedr_qp_err_bitmap {
 	QEDR_QP_ERR_RQ_PBL_FULL = 32,
 };
 
+enum qedr_qp_create_type {
+	QEDR_QP_CREATE_NONE,
+	QEDR_QP_CREATE_USER,
+	QEDR_QP_CREATE_KERNEL,
+};
+
+enum qedr_iwarp_cm_flags {
+	QEDR_IWARP_CM_WAIT_FOR_CONNECT    = BIT(0),
+	QEDR_IWARP_CM_WAIT_FOR_DISCONNECT = BIT(1),
+};
+
 struct qedr_qp {
 	struct ib_qp ibqp;	/* must be first */
 	struct qedr_dev *dev;
-	struct qedr_iw_ep *ep;
 	struct qedr_qp_hwq_info sq;
 	struct qedr_qp_hwq_info rq;
 
@@ -395,6 +406,7 @@  struct qedr_qp {
 	u32 id;
 	struct qedr_pd *pd;
 	enum ib_qp_type qp_type;
+	enum qedr_qp_create_type create_type;
 	struct qed_rdma_qp *qed_qp;
 	u32 qp_id;
 	u16 icid;
@@ -437,8 +449,11 @@  struct qedr_qp {
 	/* Relevant to qps created from user space only (applications) */
 	struct qedr_userq usq;
 	struct qedr_userq urq;
-	atomic_t refcnt;
-	bool destroyed;
+
+	/* synchronization objects used with iwarp ep */
+	struct kref refcnt;
+	struct completion iwarp_cm_comp;
+	unsigned long iwarp_cm_flags; /* enum iwarp_cm_flags */
 };
 
 struct qedr_ah {
@@ -531,7 +546,7 @@  struct qedr_iw_ep {
 	struct iw_cm_id	*cm_id;
 	struct qedr_qp	*qp;
 	void		*qed_context;
-	u8		during_connect;
+	struct kref	refcnt;
 };
 
 static inline
diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
index 22881d4442b9..26204caf0975 100644
--- a/drivers/infiniband/hw/qedr/qedr_iw_cm.c
+++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
@@ -79,6 +79,28 @@  qedr_fill_sockaddr6(const struct qed_iwarp_cm_info *cm_info,
 	}
 }
 
+static void qedr_iw_free_qp(struct kref *ref)
+{
+	struct qedr_qp *qp = container_of(ref, struct qedr_qp, refcnt);
+
+	xa_erase(&qp->dev->qps, qp->qp_id);
+	kfree(qp);
+}
+
+static void
+qedr_iw_free_ep(struct kref *ref)
+{
+	struct qedr_iw_ep *ep = container_of(ref, struct qedr_iw_ep, refcnt);
+
+	if (ep->qp)
+		kref_put(&ep->qp->refcnt, qedr_iw_free_qp);
+
+	if (ep->cm_id)
+		ep->cm_id->rem_ref(ep->cm_id);
+
+	kfree(ep);
+}
+
 static void
 qedr_iw_mpa_request(void *context, struct qed_iwarp_cm_event_params *params)
 {
@@ -93,6 +115,7 @@  qedr_iw_mpa_request(void *context, struct qed_iwarp_cm_event_params *params)
 
 	ep->dev = dev;
 	ep->qed_context = params->ep_context;
+	kref_init(&ep->refcnt);
 
 	memset(&event, 0, sizeof(event));
 	event.event = IW_CM_EVENT_CONNECT_REQUEST;
@@ -141,12 +164,10 @@  qedr_iw_close_event(void *context, struct qed_iwarp_cm_event_params *params)
 {
 	struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context;
 
-	if (ep->cm_id) {
+	if (ep->cm_id)
 		qedr_iw_issue_event(context, params, IW_CM_EVENT_CLOSE);
 
-		ep->cm_id->rem_ref(ep->cm_id);
-		ep->cm_id = NULL;
-	}
+	kref_put(&ep->refcnt, qedr_iw_free_ep);
 }
 
 static void
@@ -186,11 +207,13 @@  static void qedr_iw_disconnect_worker(struct work_struct *work)
 	struct qedr_qp *qp = ep->qp;
 	struct iw_cm_event event;
 
-	if (qp->destroyed) {
-		kfree(dwork);
-		qedr_iw_qp_rem_ref(&qp->ibqp);
-		return;
-	}
+	/* The qp won't be released until we release the ep.
+	 * the ep's refcnt was increased before calling this
+	 * function, therefore it is safe to access qp
+	 */
+	if (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_DISCONNECT,
+			     &qp->iwarp_cm_flags))
+		goto out;
 
 	memset(&event, 0, sizeof(event));
 	event.status = dwork->status;
@@ -204,7 +227,6 @@  static void qedr_iw_disconnect_worker(struct work_struct *work)
 	else
 		qp_params.new_state = QED_ROCE_QP_STATE_SQD;
 
-	kfree(dwork);
 
 	if (ep->cm_id)
 		ep->cm_id->event_handler(ep->cm_id, &event);
@@ -214,7 +236,10 @@  static void qedr_iw_disconnect_worker(struct work_struct *work)
 
 	dev->ops->rdma_modify_qp(dev->rdma_ctx, qp->qed_qp, &qp_params);
 
-	qedr_iw_qp_rem_ref(&qp->ibqp);
+	complete(&ep->qp->iwarp_cm_comp);
+out:
+	kfree(dwork);
+	kref_put(&ep->refcnt, qedr_iw_free_ep);
 }
 
 static void
@@ -224,13 +249,18 @@  qedr_iw_disconnect_event(void *context,
 	struct qedr_discon_work *work;
 	struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context;
 	struct qedr_dev *dev = ep->dev;
-	struct qedr_qp *qp = ep->qp;
 
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work)
 		return;
 
-	qedr_iw_qp_add_ref(&qp->ibqp);
+	/* We can't get a close event before disconnect, but since
+	 * we're scheduling a work queue we need to make sure close
+	 * won't delete the ep, so we increase the refcnt
+	 */
+	if (!kref_get_unless_zero(&ep->refcnt))
+		return;
+
 	work->ep = ep;
 	work->event = params->event;
 	work->status = params->status;
@@ -252,16 +282,30 @@  qedr_iw_passive_complete(void *context,
 	if ((params->status == -ECONNREFUSED) && (!ep->qp)) {
 		DP_DEBUG(dev, QEDR_MSG_IWARP,
 			 "PASSIVE connection refused releasing ep...\n");
-		kfree(ep);
+		kref_put(&ep->refcnt, qedr_iw_free_ep);
 		return;
 	}
 
+	complete(&ep->qp->iwarp_cm_comp);
 	qedr_iw_issue_event(context, params, IW_CM_EVENT_ESTABLISHED);
 
 	if (params->status < 0)
 		qedr_iw_close_event(context, params);
 }
 
+static void
+qedr_iw_active_complete(void *context,
+			struct qed_iwarp_cm_event_params *params)
+{
+	struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context;
+
+	complete(&ep->qp->iwarp_cm_comp);
+	qedr_iw_issue_event(context, params, IW_CM_EVENT_CONNECT_REPLY);
+
+	if (params->status < 0)
+		kref_put(&ep->refcnt, qedr_iw_free_ep);
+}
+
 static int
 qedr_iw_mpa_reply(void *context, struct qed_iwarp_cm_event_params *params)
 {
@@ -288,27 +332,15 @@  qedr_iw_event_handler(void *context, struct qed_iwarp_cm_event_params *params)
 		qedr_iw_mpa_reply(context, params);
 		break;
 	case QED_IWARP_EVENT_PASSIVE_COMPLETE:
-		ep->during_connect = 0;
 		qedr_iw_passive_complete(context, params);
 		break;
-
 	case QED_IWARP_EVENT_ACTIVE_COMPLETE:
-		ep->during_connect = 0;
-		qedr_iw_issue_event(context,
-				    params,
-				    IW_CM_EVENT_CONNECT_REPLY);
-		if (params->status < 0) {
-			struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context;
-
-			ep->cm_id->rem_ref(ep->cm_id);
-			ep->cm_id = NULL;
-		}
+		qedr_iw_active_complete(context, params);
 		break;
 	case QED_IWARP_EVENT_DISCONNECT:
 		qedr_iw_disconnect_event(context, params);
 		break;
 	case QED_IWARP_EVENT_CLOSE:
-		ep->during_connect = 0;
 		qedr_iw_close_event(context, params);
 		break;
 	case QED_IWARP_EVENT_RQ_EMPTY:
@@ -476,6 +508,19 @@  qedr_addr6_resolve(struct qedr_dev *dev,
 	return rc;
 }
 
+struct qedr_qp *qedr_iw_load_qp(struct qedr_dev *dev, u32 qpn)
+{
+	struct qedr_qp *qp;
+
+	xa_lock(&dev->qps);
+	qp = xa_load(&dev->qps, qpn);
+	if (!qp || !kref_get_unless_zero(&qp->refcnt))
+		qp = NULL;
+	xa_unlock(&dev->qps);
+
+	return qp;
+}
+
 int qedr_iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 {
 	struct qedr_dev *dev = get_qedr_dev(cm_id->device);
@@ -491,10 +536,6 @@  int qedr_iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	int rc = 0;
 	int i;
 
-	qp = xa_load(&dev->qps, conn_param->qpn);
-	if (unlikely(!qp))
-		return -EINVAL;
-
 	laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
 	raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
 	laddr6 = (struct sockaddr_in6 *)&cm_id->m_local_addr;
@@ -516,8 +557,15 @@  int qedr_iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 		return -ENOMEM;
 
 	ep->dev = dev;
+	kref_init(&ep->refcnt);
+
+	qp = qedr_iw_load_qp(dev, conn_param->qpn);
+	if (!qp) {
+		rc = -EINVAL;
+		goto err;
+	}
+
 	ep->qp = qp;
-	qp->ep = ep;
 	cm_id->add_ref(cm_id);
 	ep->cm_id = cm_id;
 
@@ -580,16 +628,20 @@  int qedr_iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	in_params.qp = qp->qed_qp;
 	memcpy(in_params.local_mac_addr, dev->ndev->dev_addr, ETH_ALEN);
 
-	ep->during_connect = 1;
+	if (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_CONNECT,
+			     &qp->iwarp_cm_flags))
+		goto err; /* QP already being destroyed */
+
 	rc = dev->ops->iwarp_connect(dev->rdma_ctx, &in_params, &out_params);
-	if (rc)
+	if (rc) {
+		complete(&qp->iwarp_cm_comp);
 		goto err;
+	}
 
 	return rc;
 
 err:
-	cm_id->rem_ref(cm_id);
-	kfree(ep);
+	kref_put(&ep->refcnt, qedr_iw_free_ep);
 	return rc;
 }
 
@@ -677,18 +729,17 @@  int qedr_iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	struct qedr_dev *dev = ep->dev;
 	struct qedr_qp *qp;
 	struct qed_iwarp_accept_in params;
-	int rc;
+	int rc = 0;
 
 	DP_DEBUG(dev, QEDR_MSG_IWARP, "Accept on qpid=%d\n", conn_param->qpn);
 
-	qp = xa_load(&dev->qps, conn_param->qpn);
+	qp = qedr_iw_load_qp(dev, conn_param->qpn);
 	if (!qp) {
 		DP_ERR(dev, "Invalid QP number %d\n", conn_param->qpn);
 		return -EINVAL;
 	}
 
 	ep->qp = qp;
-	qp->ep = ep;
 	cm_id->add_ref(cm_id);
 	ep->cm_id = cm_id;
 
@@ -700,15 +751,21 @@  int qedr_iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	params.ird = conn_param->ird;
 	params.ord = conn_param->ord;
 
-	ep->during_connect = 1;
+	if (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_CONNECT,
+			     &qp->iwarp_cm_flags))
+		goto err; /* QP already destroyed */
+
 	rc = dev->ops->iwarp_accept(dev->rdma_ctx, &params);
-	if (rc)
+	if (rc) {
+		complete(&qp->iwarp_cm_comp);
 		goto err;
+	}
 
 	return rc;
+
 err:
-	ep->during_connect = 0;
-	cm_id->rem_ref(cm_id);
+	kref_put(&ep->refcnt, qedr_iw_free_ep);
+
 	return rc;
 }
 
@@ -731,17 +788,14 @@  void qedr_iw_qp_add_ref(struct ib_qp *ibqp)
 {
 	struct qedr_qp *qp = get_qedr_qp(ibqp);
 
-	atomic_inc(&qp->refcnt);
+	kref_get(&qp->refcnt);
 }
 
 void qedr_iw_qp_rem_ref(struct ib_qp *ibqp)
 {
 	struct qedr_qp *qp = get_qedr_qp(ibqp);
 
-	if (atomic_dec_and_test(&qp->refcnt)) {
-		xa_erase_irq(&qp->dev->qps, qp->qp_id);
-		kfree(qp);
-	}
+	kref_put(&qp->refcnt, qedr_iw_free_qp);
 }
 
 struct ib_qp *qedr_iw_get_qp(struct ib_device *ibdev, int qpn)
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 6f3ce86019b7..dbb0b0000594 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -51,6 +51,7 @@ 
 #include "verbs.h"
 #include <rdma/qedr-abi.h>
 #include "qedr_roce_cm.h"
+#include "qedr_iw_cm.h"
 
 #define QEDR_SRQ_WQE_ELEM_SIZE	sizeof(union rdma_srq_elm)
 #define	RDMA_MAX_SGE_PER_SRQ	(4)
@@ -1193,7 +1194,10 @@  static void qedr_set_common_qp_params(struct qedr_dev *dev,
 				      struct ib_qp_init_attr *attrs)
 {
 	spin_lock_init(&qp->q_lock);
-	atomic_set(&qp->refcnt, 1);
+	if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
+		kref_init(&qp->refcnt);
+		init_completion(&qp->iwarp_cm_comp);
+	}
 	qp->pd = pd;
 	qp->qp_type = attrs->qp_type;
 	qp->max_inline_data = attrs->cap.max_inline_data;
@@ -1592,6 +1596,7 @@  static int qedr_create_user_qp(struct qedr_dev *dev,
 	int alloc_and_init = rdma_protocol_roce(&dev->ibdev, 1);
 	int rc = -EINVAL;
 
+	qp->create_type = QEDR_QP_CREATE_USER;
 	memset(&ureq, 0, sizeof(ureq));
 	rc = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
 	if (rc) {
@@ -1805,6 +1810,7 @@  static int qedr_create_kernel_qp(struct qedr_dev *dev,
 	u32 n_sq_entries;
 
 	memset(&in_params, 0, sizeof(in_params));
+	qp->create_type = QEDR_QP_CREATE_KERNEL;
 
 	/* A single work request may take up to QEDR_MAX_SQ_WQE_SIZE elements in
 	 * the ring. The ring should allow at least a single WR, even if the
@@ -2437,7 +2443,7 @@  static int qedr_free_qp_resources(struct qedr_dev *dev, struct qedr_qp *qp,
 			return rc;
 	}
 
-	if (udata)
+	if (qp->create_type == QEDR_QP_CREATE_USER)
 		qedr_cleanup_user(dev, qp);
 	else
 		qedr_cleanup_kernel(dev, qp);
@@ -2467,22 +2473,14 @@  int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 			qedr_modify_qp(ibqp, &attr, attr_mask, NULL);
 		}
 	} else {
-		/* Wait for the connect/accept to complete */
-		if (qp->ep) {
-			int wait_count = 1;
-
-			while (qp->ep->during_connect) {
-				DP_DEBUG(dev, QEDR_MSG_QP,
-					 "Still in during connect/accept\n");
-
-				msleep(100);
-				if (wait_count++ > 200) {
-					DP_NOTICE(dev,
-						  "during connect timeout\n");
-					break;
-				}
-			}
-		}
+		/* Wait for the connection setup to complete */
+		if (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_CONNECT,
+				     &qp->iwarp_cm_flags))
+			wait_for_completion(&qp->iwarp_cm_comp);
+
+		if (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_DISCONNECT,
+				     &qp->iwarp_cm_flags))
+			wait_for_completion(&qp->iwarp_cm_comp);
 	}
 
 	if (qp->qp_type == IB_QPT_GSI)
@@ -2490,11 +2488,11 @@  int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 
 	qedr_free_qp_resources(dev, qp, udata);
 
-	if (atomic_dec_and_test(&qp->refcnt) &&
-	    rdma_protocol_iwarp(&dev->ibdev, 1)) {
-		xa_erase_irq(&dev->qps, qp->qp_id);
+	if (rdma_protocol_iwarp(&dev->ibdev, 1))
+		qedr_iw_qp_rem_ref(&qp->ibqp);
+	else
 		kfree(qp);
-	}
+
 	return 0;
 }