
[rdma-next,v2,7/9] IB/cm: Clear all associated AVs' ports when removing a cm device

Message ID 00c97755c41b06af84f621a1b3e0e8adfe0771cc.1619004798.git.leonro@nvidia.com (mailing list archive)
State Superseded
Delegated to: Jason Gunthorpe
Series Fix memory corruption in CM

Commit Message

Leon Romanovsky April 21, 2021, 11:40 a.m. UTC
From: Mark Zhang <markzhang@nvidia.com>

When removing a cm device all its ports are removed as well, so all
associated AVs' ports need to be cleared.
This patch adds a cm_id_priv list to each cm_device; a cm_id is added
to this list when its primary AV is initialized, so that when the
device is removed all cm_ids on the list can be taken off the list and
have their av->port and alt_av->port pointers cleared.

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/cm.c | 75 ++++++++++++++++++++++++++++++------
 1 file changed, 63 insertions(+), 12 deletions(-)

Comments

Jason Gunthorpe April 22, 2021, 7:34 p.m. UTC | #1
On Wed, Apr 21, 2021 at 02:40:37PM +0300, Leon Romanovsky wrote:
> @@ -4396,6 +4439,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
>  	cm_dev->going_down = 1;
>  	spin_unlock_irq(&cm.lock);
>  
> +	list_for_each_entry_safe(cm_id_priv, tmp,
> +				 &cm_dev->cm_id_priv_list, cm_dev_list) {
> +		if (!list_empty(&cm_id_priv->cm_dev_list))
> +			list_del(&cm_id_priv->cm_dev_list);
> +		cm_id_priv->av.port = NULL;
> +		cm_id_priv->alt_av.port = NULL;
> +	}

Ugh, this is in the wrong order, it has to be after the work queue
flush..

Hurm, I didn't see an easy way to fix it up, but I did think of a much
better design!

Generally speaking, all we need is for the memory of the cm_dev and
port to remain active; we don't need to block or fence with
cm_remove_one(). So just stick a memory kref on this thing and keep
the memory. The only things that need to serialize with
cm_remove_one() are on the workqueue or take a spinlock (e.g. because
they touch mad_agent).

Try this. I didn't finish every detail; it applies on top of your
series, but you'll need to reflow it into new commits:

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 3feff999a5e003..c26367006a4485 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -205,8 +205,11 @@ struct cm_port {
 };
 
 struct cm_device {
+	struct kref kref;
 	struct list_head list;
+	spinlock_t unregistration_lock; /* protects port->mad_agent */
 	struct ib_device *ib_device;
+	unsigned int num_ports;
 	u8 ack_delay;
 	int going_down;
 	struct list_head cm_id_priv_list;
@@ -262,7 +265,6 @@ struct cm_id_private {
 	/* todo: use alternate port on send failure */
 	struct cm_av av;
 	struct cm_av alt_av;
-	rwlock_t av_rwlock;	/* Do not acquire inside cm.lock */
 
 	void *private_data;
 	__be64 tid;
@@ -287,10 +289,23 @@ struct cm_id_private {
 	atomic_t work_count;
 
 	struct rdma_ucm_ece ece;
-
-	struct list_head cm_dev_list;
 };
 
+static void cm_dev_release(struct kref *kref)
+{
+	struct cm_device *cm_dev = container_of(kref, struct cm_device, kref);
+	unsigned int i;
+
+	for (i = 0; i != cm_dev->num_ports; i++)
+		kfree(cm_dev->port[i]);
+	kfree(cm_dev);
+}
+
+static void cm_device_put(struct cm_device *cm_dev)
+{
+	kref_put(&cm_dev->kref, cm_dev_release);
+}
+
 static void cm_work_handler(struct work_struct *work);
 
 static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
@@ -306,12 +321,12 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
 	struct ib_ah *ah;
 	int ret;
 
-	read_lock(&cm_id_priv->av_rwlock);
-	if (!cm_id_priv->av.port) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (!cm_id_priv->av.port)
+		return ERR_PTR(-EINVAL);
 
+	spin_lock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
 	mad_agent = cm_id_priv->av.port->mad_agent;
 	if (!mad_agent) {
 		ret = -EINVAL;
@@ -330,7 +345,6 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
 			       GFP_ATOMIC,
 			       IB_MGMT_BASE_VERSION);
 
-	read_unlock(&cm_id_priv->av_rwlock);
+	spin_unlock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
 	if (IS_ERR(m)) {
 		rdma_destroy_ah(ah, 0);
 		ret = PTR_ERR(m);
@@ -346,7 +360,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
 	return m;
 
 out:
-	read_unlock(&cm_id_priv->av_rwlock);
+	spin_unlock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
 	return ERR_PTR(ret);
 }
 
@@ -465,20 +479,18 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv,
 	cm_id_priv->private_data_len = private_data_len;
 }
 
-static void add_cm_id_to_cm_dev_list(struct cm_id_private *cm_id_priv,
-				     struct cm_device *cm_dev)
+static void cm_set_av_port(struct cm_av *av, struct cm_port *port)
 {
-	unsigned long flags;
+	struct cm_port *old_port = av->port;
 
-	spin_lock_irqsave(&cm.lock, flags);
-	if (cm_dev->going_down)
-		goto out;
+	if (old_port == port)
+		return;
 
-	if (!list_empty(&cm_id_priv->cm_dev_list))
-		list_del(&cm_id_priv->cm_dev_list);
-	list_add_tail(&cm_id_priv->cm_dev_list, &cm_dev->cm_id_priv_list);
-out:
-	spin_unlock_irqrestore(&cm.lock, flags);
+	av->port = port;
+	if (port)
+		kref_get(&port->cm_dev->kref);
+	if (old_port)
+		cm_device_put(old_port->cm_dev);
 }
 
 static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
@@ -505,11 +517,8 @@ static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
 	if (ret)
 		return ret;
 
-	write_lock(&cm_id_priv->av_rwlock);
-	av->port = port;
+	cm_set_av_port(av, port);
 	av->pkey_index = wc->pkey_index;
-	add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
-	write_unlock(&cm_id_priv->av_rwlock);
 
 	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
 	return 0;
@@ -521,10 +530,7 @@ static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
 {
 	struct cm_av *av = &cm_id_priv->av;
 
-	write_lock(&cm_id_priv->av_rwlock);
-	av->port = port;
-	add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
-	write_unlock(&cm_id_priv->av_rwlock);
+	cm_set_av_port(av, port);
 	av->pkey_index = wc->pkey_index;
 	return ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
 				       port->port_num, wc,
@@ -588,12 +594,9 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 		return -EINVAL;
 	cm_dev = port->cm_dev;
 
-	read_lock(&cm_id_priv->av_rwlock);
 	if (!is_priv_av &&
 	    (!cm_id_priv->av.port || cm_dev != cm_id_priv->av.port->cm_dev))
 		ret = -EINVAL;
-
-	read_unlock(&cm_id_priv->av_rwlock);
 	if (ret)
 		return ret;
 
@@ -618,13 +621,8 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 	if (ret)
 		return ret;
 
-	write_lock(&cm_id_priv->av_rwlock);
-	av->port = port;
+	cm_set_av_port(av, port);
 	av->timeout = path->packet_life_time + 1;
-	if (is_priv_av)
-		add_cm_id_to_cm_dev_list(cm_id_priv, cm_dev);
-
-	write_unlock(&cm_id_priv->av_rwlock);
 
 	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
 	return 0;
@@ -905,10 +903,8 @@ static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
 	spin_lock_init(&cm_id_priv->lock);
 	init_completion(&cm_id_priv->comp);
 	INIT_LIST_HEAD(&cm_id_priv->work_list);
-	INIT_LIST_HEAD(&cm_id_priv->cm_dev_list);
 	atomic_set(&cm_id_priv->work_count, -1);
 	refcount_set(&cm_id_priv->refcount, 1);
-	rwlock_init(&cm_id_priv->av_rwlock);
 
 	ret = xa_alloc_cyclic(&cm.local_id_table, &id, NULL, xa_limit_32b,
 			      &cm.local_id_next, GFP_KERNEL);
@@ -1027,10 +1023,8 @@ static u8 cm_ack_timeout_req(struct cm_id_private *cm_id_priv,
 {
 	u8 ack_delay = 0;
 
-	read_lock(&cm_id_priv->av_rwlock);
-	if (cm_id_priv->av.port && cm_id_priv->av.port->cm_dev)
+	if (cm_id_priv->av.port)
 		ack_delay = cm_id_priv->av.port->cm_dev->ack_delay;
-	read_unlock(&cm_id_priv->av_rwlock);
 
 	return cm_ack_timeout(ack_delay, packet_life_time);
 }
@@ -1228,8 +1222,6 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		cm_id_priv->timewait_info = NULL;
 	}
 
-	if (!list_empty(&cm_id_priv->cm_dev_list))
-		list_del(&cm_id_priv->cm_dev_list);
 	WARN_ON(cm_id_priv->listen_sharecount);
 	WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node));
 	if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
@@ -1246,6 +1238,8 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 	rdma_destroy_ah_attr(&cm_id_priv->av.ah_attr);
 	rdma_destroy_ah_attr(&cm_id_priv->alt_av.ah_attr);
 	kfree(cm_id_priv->private_data);
+	cm_set_av_port(&cm_id_priv->av, NULL);
+	cm_set_av_port(&cm_id_priv->alt_av, NULL);
 	kfree_rcu(cm_id_priv, rcu);
 }
 
@@ -1378,10 +1372,13 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
 {
 	u64 hi_tid = 0, low_tid;
 
-	read_lock(&cm_id_priv->av_rwlock);
-	if (cm_id_priv->av.port && cm_id_priv->av.port->mad_agent)
-		hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
-	read_unlock(&cm_id_priv->av_rwlock);
+	if (cm_id_priv->av.port) {
+		spin_lock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
+		if (cm_id_priv->av.port->mad_agent)
+			hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid)
+				 << 32;
+		spin_unlock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
+	}
 
 	low_tid  = (u64)cm_id_priv->id.local_id;
 	return cpu_to_be64(hi_tid | low_tid);
@@ -1879,12 +1876,10 @@ static void cm_format_req_event(struct cm_work *work,
 	param = &work->cm_event.param.req_rcvd;
 	param->listen_id = listen_id;
 	param->bth_pkey = cm_get_bth_pkey(work);
-	read_lock(&cm_id_priv->av_rwlock);
 	if (cm_id_priv->av.port)
 		param->port = cm_id_priv->av.port->port_num;
 	else
 		param->port = 0;
-	read_unlock(&cm_id_priv->av_rwlock);
 	param->primary_path = &work->path[0];
 	cm_opa_to_ib_sgid(work, param->primary_path);
 	if (cm_req_has_alt_path(req_msg)) {
@@ -2311,13 +2306,11 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg,
 	IBA_SET(CM_REP_STARTING_PSN, rep_msg, param->starting_psn);
 	IBA_SET(CM_REP_RESPONDER_RESOURCES, rep_msg,
 		param->responder_resources);
-	read_lock(&cm_id_priv->av_rwlock);
 	if (cm_id_priv->av.port && cm_id_priv->av.port->cm_dev)
 		IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg,
 			cm_id_priv->av.port->cm_dev->ack_delay);
 	else
 		IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg, 0);
-	read_unlock(&cm_id_priv->av_rwlock);
 	IBA_SET(CM_REP_FAILOVER_ACCEPTED, rep_msg, param->failover_accepted);
 	IBA_SET(CM_REP_RNR_RETRY_COUNT, rep_msg, param->rnr_retry_count);
 	IBA_SET(CM_REP_LOCAL_CA_GUID, rep_msg,
@@ -4187,10 +4180,8 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
 			qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
 						    IB_ACCESS_REMOTE_ATOMIC;
 		qp_attr->pkey_index = cm_id_priv->av.pkey_index;
-		read_lock(&cm_id_priv->av_rwlock);
 		qp_attr->port_num = cm_id_priv->av.port ?
 			cm_id_priv->av.port->port_num : 0;
-		read_unlock(&cm_id_priv->av_rwlock);
 		ret = 0;
 		break;
 	default:
@@ -4234,10 +4225,8 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
 		}
 		if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) {
 			*qp_attr_mask |= IB_QP_ALT_PATH;
-			read_lock(&cm_id_priv->av_rwlock);
 			qp_attr->alt_port_num = cm_id_priv->alt_av.port ?
 				cm_id_priv->alt_av.port->port_num : 0;
-			read_unlock(&cm_id_priv->av_rwlock);
 			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
 			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
 			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
@@ -4296,10 +4285,8 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
 			}
 		} else {
 			*qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
-			read_lock(&cm_id_priv->av_rwlock);
 			qp_attr->alt_port_num = cm_id_priv->alt_av.port ?
 				cm_id_priv->alt_av.port->port_num : 0;
-			read_unlock(&cm_id_priv->av_rwlock);
 			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
 			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
 			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
@@ -4417,9 +4404,11 @@ static int cm_add_one(struct ib_device *ib_device)
 	if (!cm_dev)
 		return -ENOMEM;
 
+	kref_init(&cm_dev->kref);
+	spin_lock_init(&cm_dev->unregistration_lock);
 	cm_dev->ib_device = ib_device;
 	cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
 	cm_dev->going_down = 0;
+	cm_dev->num_ports = ib_device->phys_port_cnt;
 	INIT_LIST_HEAD(&cm_dev->cm_id_priv_list);
 
 	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
@@ -4489,10 +4478,9 @@ static int cm_add_one(struct ib_device *ib_device)
 		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
 		ib_unregister_mad_agent(port->mad_agent);
 		cm_remove_port_fs(port);
-		kfree(port);
 	}
 free:
-	kfree(cm_dev);
+	cm_device_put(cm_dev);
 	return ret;
 }
 
@@ -4515,21 +4503,15 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 	cm_dev->going_down = 1;
 	spin_unlock_irq(&cm.lock);
 
-	list_for_each_entry_safe(cm_id_priv, tmp,
-				 &cm_dev->cm_id_priv_list, cm_dev_list) {
-		write_lock(&cm_id_priv->av_rwlock);
-		if (!list_empty(&cm_id_priv->cm_dev_list))
-			list_del(&cm_id_priv->cm_dev_list);
-		cm_id_priv->av.port = NULL;
-		cm_id_priv->alt_av.port = NULL;
-		write_unlock(&cm_id_priv->av_rwlock);
-	}
-
 	rdma_for_each_port (ib_device, i) {
+		struct ib_mad_agent *mad_agent;
+
 		if (!rdma_cap_ib_cm(ib_device, i))
 			continue;
 
 		port = cm_dev->port[i-1];
+		mad_agent = port->mad_agent;
+
 		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
 		/*
 		 * We flush the queue here after the going_down set, this
@@ -4537,12 +4519,20 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 		 * after that we can call the unregister_mad_agent
 		 */
 		flush_workqueue(cm.wq);
-		ib_unregister_mad_agent(port->mad_agent);
+		/*
+		 * The above ensures no call paths from the work are running,
+		 * the remaining paths all take the unregistration lock
+		 */
+		spin_lock(&cm_dev->unregistration_lock);
+		port->mad_agent = NULL;
+		spin_unlock(&cm_dev->unregistration_lock);
+		ib_unregister_mad_agent(mad_agent);
 		cm_remove_port_fs(port);
-		kfree(port);
 	}
 
-	kfree(cm_dev);
+	/* All touches can only be on call path from the work */
+	cm_dev->ib_device = NULL;
+	cm_device_put(cm_dev);
 }
 
 static int __init ib_cm_init(void)
Mark Zhang April 23, 2021, 1:14 p.m. UTC | #2
On 4/23/2021 3:34 AM, Jason Gunthorpe wrote:
> On Wed, Apr 21, 2021 at 02:40:37PM +0300, Leon Romanovsky wrote:
>> @@ -4396,6 +4439,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
>>   	cm_dev->going_down = 1;
>>   	spin_unlock_irq(&cm.lock);
>>   
>> +	list_for_each_entry_safe(cm_id_priv, tmp,
>> +				 &cm_dev->cm_id_priv_list, cm_dev_list) {
>> +		if (!list_empty(&cm_id_priv->cm_dev_list))
>> +			list_del(&cm_id_priv->cm_dev_list);
>> +		cm_id_priv->av.port = NULL;
>> +		cm_id_priv->alt_av.port = NULL;
>> +	}
> 
> Ugh, this is in the wrong order, it has to be after the work queue
> flush..
> 
> Hurm, I didn't see an easy way to fix it up, but I did think of a much
> better design!
> 
> Generally speaking, all we need is for the memory of the cm_dev and
> port to remain active; we don't need to block or fence with
> cm_remove_one(). So just stick a memory kref on this thing and keep
> the memory. The only things that need to serialize with
> cm_remove_one() are on the workqueue or take a spinlock (e.g. because
> they touch mad_agent).
> 
> Try this. I didn't finish every detail; it applies on top of your
> series, but you'll need to reflow it into new commits:

Thanks Jason, I think we still need a rwlock to protect "av->port"? It 
is modified and cleared by cm_set_av_port() and read in many places.
Jason Gunthorpe April 23, 2021, 2:24 p.m. UTC | #3
On Fri, Apr 23, 2021 at 09:14:21PM +0800, Mark Zhang wrote:
> 
> 
> On 4/23/2021 3:34 AM, Jason Gunthorpe wrote:
> > On Wed, Apr 21, 2021 at 02:40:37PM +0300, Leon Romanovsky wrote:
> > > @@ -4396,6 +4439,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
> > >   	cm_dev->going_down = 1;
> > >   	spin_unlock_irq(&cm.lock);
> > > +	list_for_each_entry_safe(cm_id_priv, tmp,
> > > +				 &cm_dev->cm_id_priv_list, cm_dev_list) {
> > > +		if (!list_empty(&cm_id_priv->cm_dev_list))
> > > +			list_del(&cm_id_priv->cm_dev_list);
> > > +		cm_id_priv->av.port = NULL;
> > > +		cm_id_priv->alt_av.port = NULL;
> > > +	}
> > 
> > Ugh, this is in the wrong order, it has to be after the work queue
> > flush..
> > 
> > Hurm, I didn't see an easy way to fix it up, but I did think of a much
> > better design!
> > 
> > Generally speaking, all we need is for the memory of the cm_dev and
> > port to remain active; we don't need to block or fence with
> > cm_remove_one(). So just stick a memory kref on this thing and keep
> > the memory. The only things that need to serialize with
> > cm_remove_one() are on the workqueue or take a spinlock (e.g. because
> > they touch mad_agent).
> > 
> > Try this. I didn't finish every detail; it applies on top of your
> > series, but you'll need to reflow it into new commits:
> 
> Thanks Jason, I think we still need a rwlock to protect "av->port"? It is
> modified and cleared by cm_set_av_port() and read in many places.

Hum..

This is a real mess.

It looks to me like any access to the av->port should always be
protected by the cm_id_priv->lock.

Most already are, but the sets are wrong and a couple of readers are
wrong.

Set reverse call chains:

cm_init_av_for_lap()
 cm_lap_handler(work) (OK)

cm_init_av_for_response()
 cm_req_handler(work) (OK, cm_id_priv is on stack)
 cm_sidr_req_handler(work) (OK, cm_id_priv is on stack)

cm_init_av_by_path()
 cm_req_handler(work) (OK, cm_id_priv is on stack)
 cm_lap_handler(work) (OK)
 ib_send_cm_req() (not locked)
   cma_connect_ib()
    rdma_connect_locked()
     [..]
   ipoib_cm_send_req()
   srp_send_req()
     srp_connect_ch()
      [..]
 ib_send_cm_sidr_req() (not locked)
  cma_resolve_ib_udp()
   rdma_connect_locked()

And
  cm_destroy_id() (locked)

And read reverse call chains:

cm_alloc_msg()
 ib_send_cm_req() (not locked)
 ib_send_cm_rep() (OK)
 ib_send_cm_rtu() (OK)
 cm_send_dreq_locked() (OK)
 cm_send_drep_locked() (OK)
 cm_send_rej_locked() (OK)
 ib_send_cm_mra() (OK)
 ib_send_cm_sidr_req() (not locked)
 cm_send_sidr_rep_locked() (OK)
cm_form_tid()
 cm_format_req()
  ib_send_cm_req() (sort of OK)
 cm_format_dreq()
   cm_send_dreq_locked (OK)
cm_format_req()
  ib_send_cm_req() (sort of OK)
cm_format_req_event()
 cm_req_handler() (OK, cm_id_priv is on stack)
cm_format_rep()
 ib_send_cm_rep() (OK)
cm_rep_handler(work) (OK)
cm_establish_handler(work) (OK)
cm_rtu_handler(work) (OK)
cm_send_dreq_locked() (OK)
cm_dreq_handler(work) (OK)
cm_drep_handler(work) (OK)
cm_rej_handler(work) (OK)
cm_mra_handler(work) (OK)
cm_apr_handler(work) (OK)
cm_sidr_rep_handler(work) (OK)
cm_init_qp_init_attr() (OK)
cm_init_qp_rtr_attr() (OK)
cm_init_qp_rts_attr() (OK)

So.. That leaves these functions that are not obviously locked
correctly:
 ib_send_cm_req()
 ib_send_cm_sidr_req()

And the way their locking expects to work is basically that there are
no parallel touches to the cm_id - however, I'm doubtful this is
completely true.

So no new lock needed, but something should be done about the above
two functions, and this should be documented
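
For illustration, the note being asked for might look something like
this (an editorial sketch only, not part of the posted series):

/*
 * Locking note (sketch): ib_send_cm_req() and ib_send_cm_sidr_req()
 * write cm_id_priv->av and cm_id_priv->alt_av without holding
 * cm_id_priv->lock.  This relies on the ULP still owning the cm_id
 * exclusively: no MAD can arrive for the id before the REQ/SIDR_REQ
 * is actually posted, and callers must not invoke concurrent
 * operations on the same cm_id.
 */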

Jason
Mark Zhang April 24, 2021, 2:33 a.m. UTC | #4
On 4/23/2021 10:24 PM, Jason Gunthorpe wrote:
> On Fri, Apr 23, 2021 at 09:14:21PM +0800, Mark Zhang wrote:
>>
>>
>> On 4/23/2021 3:34 AM, Jason Gunthorpe wrote:
>>> On Wed, Apr 21, 2021 at 02:40:37PM +0300, Leon Romanovsky wrote:
>>>> @@ -4396,6 +4439,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
>>>>    	cm_dev->going_down = 1;
>>>>    	spin_unlock_irq(&cm.lock);
>>>> +	list_for_each_entry_safe(cm_id_priv, tmp,
>>>> +				 &cm_dev->cm_id_priv_list, cm_dev_list) {
>>>> +		if (!list_empty(&cm_id_priv->cm_dev_list))
>>>> +			list_del(&cm_id_priv->cm_dev_list);
>>>> +		cm_id_priv->av.port = NULL;
>>>> +		cm_id_priv->alt_av.port = NULL;
>>>> +	}
>>>
>>> Ugh, this is in the wrong order, it has to be after the work queue
>>> flush..
>>>
>>> Hurm, I didn't see an easy way to fix it up, but I did think of a much
>>> better design!
>>>
>>> Generally speaking, all we need is for the memory of the cm_dev and
>>> port to remain active; we don't need to block or fence with
>>> cm_remove_one(). So just stick a memory kref on this thing and keep
>>> the memory. The only things that need to serialize with
>>> cm_remove_one() are on the workqueue or take a spinlock (e.g. because
>>> they touch mad_agent).
>>>
>>> Try this. I didn't finish every detail; it applies on top of your
>>> series, but you'll need to reflow it into new commits:
>>
>> Thanks Jason, I think we still need a rwlock to protect "av->port"? It is
>> modified and cleared by cm_set_av_port() and read in many places.
> 
> Hum..
> 
> This is a real mess.
> 
> It looks to me like any access to the av->port should always be
> protected by the cm_id_priv->lock.
> 
> Most already are, but the sets are wrong and a couple of readers are
> wrong.
> 
> Set reverse call chains:
> 
> cm_init_av_for_lap()
>   cm_lap_handler(work) (OK)
> 
> cm_init_av_for_response()
>   cm_req_handler(work) (OK, cm_id_priv is on stack)
>   cm_sidr_req_handler(work) (OK, cm_id_priv is on stack)
> 
> cm_init_av_by_path()
>   cm_req_handler(work) (OK, cm_id_priv is on stack)
>   cm_lap_handler(work) (OK)
>   ib_send_cm_req() (not locked)
>     cma_connect_ib()
>      rdma_connect_locked()
>       [..]
>     ipoib_cm_send_req()
>     srp_send_req()
>       srp_connect_ch()
>        [..]
>   ib_send_cm_sidr_req() (not locked)
>    cma_resolve_ib_udp()
>     rdma_connect_locked()
> 

Both cm_init_av_for_lap() and cm_init_av_by_path() might sleep (the
last patch of this series tries to fix this issue), so they cannot be
called with cm_id_priv->lock held.


> And
>    cm_destroy_id() (locked)
> 
> And read reverse call chains:
> 
> cm_alloc_msg()
>   ib_send_cm_req() (not locked)
>   ib_send_cm_rep() (OK)
>   ib_send_cm_rtu() (OK)
>   cm_send_dreq_locked() (OK)
>   cm_send_drep_locked() (OK)
>   cm_send_rej_locked() (OK)
>   ib_send_cm_mra() (OK)
>   ib_send_cm_sidr_req() (not locked)
>   cm_send_sidr_rep_locked() (OK)
> cm_form_tid()
>   cm_format_req()
>    ib_send_cm_req() (sort of OK)
>   cm_format_dreq()
>     cm_send_dreq_locked (OK)
> cm_format_req()
>    ib_send_cm_req() (sort of OK)
> cm_format_req_event()
>   cm_req_handler() (OK, cm_id_priv is on stack)
> cm_format_rep()
>   ib_send_cm_rep() (OK)
> cm_rep_handler(work) (OK)
> cm_establish_handler(work) (OK)
> cm_rtu_handler(work) (OK)
> cm_send_dreq_locked() (OK)
> cm_dreq_handler(work) (OK)
> cm_drep_handler(work) (OK)
> cm_rej_handler(work) (OK)
> cm_mra_handler(work) (OK)
> cm_apr_handler(work) (OK)
> cm_sidr_rep_handler(work) (OK)
> cm_init_qp_init_attr() (OK)
> cm_init_qp_rtr_attr() (OK)
> cm_init_qp_rts_attr() (OK)
> 
> So.. That leaves these functions that are not obviously locked
> correctly:
>   ib_send_cm_req()
>   ib_send_cm_sidr_req()
> 
> And the way their locking expects to work is basically that there are
> no parallel touches to the cm_id - however, I'm doubtful this is
> completely true.
> 
> So no new lock needed, but something should be done about the above
> two functions, and this should be documented
> 
> Jason
>
Jason Gunthorpe April 26, 2021, 1:56 p.m. UTC | #5
On Sat, Apr 24, 2021 at 10:33:13AM +0800, Mark Zhang wrote:
> > 
> > Set reverse call chains:
> > 
> > cm_init_av_for_lap()
> >   cm_lap_handler(work) (OK)
> > 
> > cm_init_av_for_response()
> >   cm_req_handler(work) (OK, cm_id_priv is on stack)
> >   cm_sidr_req_handler(work) (OK, cm_id_priv is on stack)
> > 
> > cm_init_av_by_path()
> >   cm_req_handler(work) (OK, cm_id_priv is on stack)
> >   cm_lap_handler(work) (OK)
> >   ib_send_cm_req() (not locked)
> >     cma_connect_ib()
> >      rdma_connect_locked()
> >       [..]
> >     ipoib_cm_send_req()
> >     srp_send_req()
> >       srp_connect_ch()
> >        [..]
> >   ib_send_cm_sidr_req() (not locked)
> >    cma_resolve_ib_udp()
> >     rdma_connect_locked()
> > 
> 
> Both cm_init_av_for_lap() 

Well, it is wrong today, look at cm_lap_handler():

	spin_lock_irq(&cm_id_priv->lock);
	[..]
	ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc,
				 work->mad_recv_wc->recv_buf.grh,
				 &cm_id_priv->av);
	[..]
	cm_queue_work_unlock(cm_id_priv, work);

These need to be restructured: the sleeping calls that extract the
new_ah_attr have to be done before we go into the spinlock.

That is probably the general solution to all the cases: do some work
before the lock and then copy from the stack to the memory under the
spinlock.
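
An editorial sketch of that shape for the LAP case (untested;
cm_set_av_port() is the helper from the draft above, and the name
cm_lap_handler_av() is hypothetical):

static int cm_lap_handler_av(struct cm_work *work,
			     struct cm_id_private *cm_id_priv)
{
	struct rdma_ah_attr new_ah_attr;
	int ret;

	/* Sleeping resolution work first, no locks held. */
	ret = ib_init_ah_attr_from_wc(work->port->cm_dev->ib_device,
				      work->port->port_num,
				      work->mad_recv_wc->wc,
				      work->mad_recv_wc->recv_buf.grh,
				      &new_ah_attr);
	if (ret)
		return ret;

	/* Commit the result from the stack under the lock. */
	spin_lock_irq(&cm_id_priv->lock);
	cm_set_av_port(&cm_id_priv->av, work->port);
	cm_id_priv->av.pkey_index = work->mad_recv_wc->wc->pkey_index;
	rdma_move_ah_attr(&cm_id_priv->av.ah_attr, &new_ah_attr);
	spin_unlock_irq(&cm_id_priv->lock);
	return 0;
}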

Jason
Mark Zhang April 27, 2021, 1:59 a.m. UTC | #6
On 4/26/2021 9:56 PM, Jason Gunthorpe wrote:
> On Sat, Apr 24, 2021 at 10:33:13AM +0800, Mark Zhang wrote:
>>>
>>> Set reverse call chains:
>>>
>>> cm_init_av_for_lap()
>>>    cm_lap_handler(work) (ok)
>>>
>>> cm_init_av_for_response()
>>>    cm_req_handler(work) (OK, cm_id_priv is on stack)
>>>    cm_sidr_req_handler(work) (OK, cm_id_priv is on stack)
>>>
>>> cm_init_av_by_path()
>>>    cm_req_handler(work) (OK, cm_id_priv is on stack)
>>>    cm_lap_handler(work) (OK)
>>>    ib_send_cm_req() (not locked)
>>>      cma_connect_ib()
>>>       rdma_connect_locked()
>>>        [..]
>>>      ipoib_cm_send_req()
>>>      srp_send_req()
>>>        srp_connect_ch()
>>>         [..]
>>>    ib_send_cm_sidr_req() (not locked)
>>>     cma_resolve_ib_udp()
>>>      rdma_connect_locked()
>>>
>>
>> Both cm_init_av_for_lap()
> 
> Well, it is wrong today, look at cm_lap_handler():
> 
> 	spin_lock_irq(&cm_id_priv->lock);
> 	[..]
> 	ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc,
> 				 work->mad_recv_wc->recv_buf.grh,
> 				 &cm_id_priv->av);
> 	[..]
> 	cm_queue_work_unlock(cm_id_priv, work);
> 
> These need to be restructured: the sleeping calls that extract the
> new_ah_attr have to be done before we go into the spinlock.
> 
> That is probably the general solution to all the cases: do some work
> before the lock and then copy from the stack to the memory under the
> spinlock.

Maybe we can call cm_set_av_port(av, port) outside of cm_init_av_*, so
that we can take cm_id_priv->lock where needed.
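
Something like this, perhaps (an editorial sketch of the idea,
untested; it assumes the cm_set_av_port() helper from Jason's draft):

static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
			      struct ib_grh *grh, struct cm_av *av)
{
	struct rdma_ah_attr new_ah_attr;
	int ret;

	/* No av->port assignment in here any more. */
	av->pkey_index = wc->pkey_index;
	ret = ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
				      port->port_num, wc, grh,
				      &new_ah_attr);
	if (ret)
		return ret;

	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
	return 0;
}

The caller would then install the port where the locking context is
known:

	spin_lock_irq(&cm_id_priv->lock);
	cm_set_av_port(&cm_id_priv->av, work->port);
	spin_unlock_irq(&cm_id_priv->lock);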

Patch

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 2d62c90f9790..f1a24492924f 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -209,6 +209,7 @@  struct cm_device {
 	struct ib_device *ib_device;
 	u8 ack_delay;
 	int going_down;
+	struct list_head cm_id_priv_list;
 	struct cm_port *port[];
 };
 
@@ -285,6 +286,8 @@  struct cm_id_private {
 	atomic_t work_count;
 
 	struct rdma_ucm_ece ece;
+
+	struct list_head cm_dev_list;
 };
 
 static void cm_work_handler(struct work_struct *work);
@@ -440,9 +443,28 @@  static void cm_set_private_data(struct cm_id_private *cm_id_priv,
 	cm_id_priv->private_data_len = private_data_len;
 }
 
+static void add_cm_id_to_cm_dev_list(struct cm_id_private *cm_id_priv,
+				     struct cm_device *cm_dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cm.lock, flags);
+	if (cm_dev->going_down)
+		goto out;
+
+	if (!list_empty(&cm_id_priv->cm_dev_list))
+		list_del(&cm_id_priv->cm_dev_list);
+	list_add_tail(&cm_id_priv->cm_dev_list, &cm_dev->cm_id_priv_list);
+
+out:
+	spin_unlock_irqrestore(&cm.lock, flags);
+}
+
 static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
-			      struct ib_grh *grh, struct cm_av *av)
+			      struct ib_grh *grh,
+			      struct cm_id_private *cm_id_priv)
 {
+	struct cm_av *av = &cm_id_priv->av;
 	struct rdma_ah_attr new_ah_attr;
 	int ret;
 
@@ -462,14 +484,20 @@  static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
 	if (ret)
 		return ret;
 
+	add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
+
 	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
 	return 0;
 }
 
 static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
-				   struct ib_grh *grh, struct cm_av *av)
+				   struct ib_grh *grh,
+				   struct cm_id_private *cm_id_priv)
 {
+	struct cm_av *av = &cm_id_priv->av;
+
 	av->port = port;
+	add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
 	av->pkey_index = wc->pkey_index;
 	return ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
 				       port->port_num, wc,
@@ -519,11 +547,13 @@  get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr)
 
 static int cm_init_av_by_path(struct sa_path_rec *path,
 			      const struct ib_gid_attr *sgid_attr,
-			      struct cm_av *av)
+			      struct cm_id_private *cm_id_priv,
+			      bool is_priv_av)
 {
 	struct rdma_ah_attr new_ah_attr;
 	struct cm_device *cm_dev;
 	struct cm_port *port;
+	struct cm_av *av;
 	int ret;
 
 	port = get_cm_port_from_path(path, sgid_attr);
@@ -531,6 +561,11 @@  static int cm_init_av_by_path(struct sa_path_rec *path,
 		return -EINVAL;
 	cm_dev = port->cm_dev;
 
+	if (!is_priv_av && cm_dev != cm_id_priv->av.port->cm_dev)
+		return -EINVAL;
+
+	av = is_priv_av ? &cm_id_priv->av : &cm_id_priv->alt_av;
+
 	ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num,
 				  be16_to_cpu(path->pkey), &av->pkey_index);
 	if (ret)
@@ -554,6 +589,9 @@  static int cm_init_av_by_path(struct sa_path_rec *path,
 
 	av->timeout = path->packet_life_time + 1;
 	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
+	if (is_priv_av)
+		add_cm_id_to_cm_dev_list(cm_id_priv, cm_dev);
+
 	return 0;
 }
 
@@ -832,6 +870,7 @@  static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
 	spin_lock_init(&cm_id_priv->lock);
 	init_completion(&cm_id_priv->comp);
 	INIT_LIST_HEAD(&cm_id_priv->work_list);
+	INIT_LIST_HEAD(&cm_id_priv->cm_dev_list);
 	atomic_set(&cm_id_priv->work_count, -1);
 	refcount_set(&cm_id_priv->refcount, 1);
 
@@ -1133,6 +1172,8 @@  static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		cm_id_priv->timewait_info = NULL;
 	}
 
+	if (!list_empty(&cm_id_priv->cm_dev_list))
+		list_del(&cm_id_priv->cm_dev_list);
 	WARN_ON(cm_id_priv->listen_sharecount);
 	WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node));
 	if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
@@ -1500,12 +1541,12 @@  int ib_send_cm_req(struct ib_cm_id *cm_id,
 	}
 
 	ret = cm_init_av_by_path(param->primary_path,
-				 param->ppath_sgid_attr, &cm_id_priv->av);
+				 param->ppath_sgid_attr, cm_id_priv, true);
 	if (ret)
 		goto out;
 	if (param->alternate_path) {
 		ret = cm_init_av_by_path(param->alternate_path, NULL,
-					 &cm_id_priv->alt_av);
+					 cm_id_priv, false);
 		if (ret)
 			goto out;
 	}
@@ -2083,7 +2124,7 @@  static int cm_req_handler(struct cm_work *work)
 
 	ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
 				      work->mad_recv_wc->recv_buf.grh,
-				      &cm_id_priv->av);
+				      cm_id_priv);
 	if (ret)
 		goto destroy;
 	cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
@@ -2137,7 +2178,7 @@  static int cm_req_handler(struct cm_work *work)
 		sa_path_set_dmac(&work->path[0],
 				 cm_id_priv->av.ah_attr.roce.dmac);
 	work->path[0].hop_limit = grh->hop_limit;
-	ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av);
+	ret = cm_init_av_by_path(&work->path[0], gid_attr, cm_id_priv, true);
 	if (ret) {
 		int err;
 
@@ -2156,7 +2197,7 @@  static int cm_req_handler(struct cm_work *work)
 	}
 	if (cm_req_has_alt_path(req_msg)) {
 		ret = cm_init_av_by_path(&work->path[1], NULL,
-					 &cm_id_priv->alt_av);
+					 cm_id_priv, false);
 		if (ret) {
 			ib_send_cm_rej(&cm_id_priv->id,
 				       IB_CM_REJ_INVALID_ALT_GID,
@@ -3328,12 +3369,12 @@  static int cm_lap_handler(struct cm_work *work)
 
 	ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc,
 				 work->mad_recv_wc->recv_buf.grh,
-				 &cm_id_priv->av);
+				 cm_id_priv);
 	if (ret)
 		goto unlock;
 
 	ret = cm_init_av_by_path(param->alternate_path, NULL,
-				 &cm_id_priv->alt_av);
+				 cm_id_priv, false);
 	if (ret)
 		goto unlock;
 
@@ -3452,7 +3493,7 @@  int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
 
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
 	ret = cm_init_av_by_path(param->path, param->sgid_attr,
-				 &cm_id_priv->av);
+				 cm_id_priv, true);
 	if (ret)
 		return ret;
 
@@ -3542,7 +3583,7 @@  static int cm_sidr_req_handler(struct cm_work *work)
 	cm_id_priv->av.dgid.global.interface_id = 0;
 	ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
 				      work->mad_recv_wc->recv_buf.grh,
-				      &cm_id_priv->av);
+				      cm_id_priv);
 	if (ret)
 		goto out;
 
@@ -4303,6 +4344,7 @@  static int cm_add_one(struct ib_device *ib_device)
 	cm_dev->ib_device = ib_device;
 	cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
 	cm_dev->going_down = 0;
+	INIT_LIST_HEAD(&cm_dev->cm_id_priv_list);
 
 	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
 	rdma_for_each_port (ib_device, i) {
@@ -4381,6 +4423,7 @@  static int cm_add_one(struct ib_device *ib_device)
 static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 {
 	struct cm_device *cm_dev = client_data;
+	struct cm_id_private *cm_id_priv, *tmp;
 	struct cm_port *port;
 	struct ib_port_modify port_modify = {
 		.clr_port_cap_mask = IB_PORT_CM_SUP
@@ -4396,6 +4439,14 @@  static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 	cm_dev->going_down = 1;
 	spin_unlock_irq(&cm.lock);
 
+	list_for_each_entry_safe(cm_id_priv, tmp,
+				 &cm_dev->cm_id_priv_list, cm_dev_list) {
+		if (!list_empty(&cm_id_priv->cm_dev_list))
+			list_del(&cm_id_priv->cm_dev_list);
+		cm_id_priv->av.port = NULL;
+		cm_id_priv->alt_av.port = NULL;
+	}
+
 	rdma_for_each_port (ib_device, i) {
 		if (!rdma_cap_ib_cm(ib_device, i))
 			continue;