Message ID | 00c97755c41b06af84f621a1b3e0e8adfe0771cc.1619004798.git.leonro@nvidia.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Jason Gunthorpe |
Headers | show |
Series | Fix memory corruption in CM | expand |
On Wed, Apr 21, 2021 at 02:40:37PM +0300, Leon Romanovsky wrote: > @@ -4396,6 +4439,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) > cm_dev->going_down = 1; > spin_unlock_irq(&cm.lock); > > + list_for_each_entry_safe(cm_id_priv, tmp, > + &cm_dev->cm_id_priv_list, cm_dev_list) { > + if (!list_empty(&cm_id_priv->cm_dev_list)) > + list_del(&cm_id_priv->cm_dev_list); > + cm_id_priv->av.port = NULL; > + cm_id_priv->alt_av.port = NULL; > + } Ugh, this is in the wrong order, it has to be after the work queue flush.. Hurm, I didn't see an easy way to fix it up, but I did think of a much better design! Generally speaking all we need is the memory of the cm_dev and port to remain active, we don't need to block or fence with cm_remove_one(), so just stick a memory kref on this thing and keep the memory. The only things that need to serialize with cm_remove_one() are on the workqueue or take a spinlock (eg because they touch mad_agent) Try this, I didn't finish every detail, applies on top of your series, but you'll need to reflow it into new commits: diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 3feff999a5e003..c26367006a4485 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -205,8 +205,11 @@ struct cm_port { }; struct cm_device { + struct kref kref; struct list_head list; + struct mutex unregistration_lock; struct ib_device *ib_device; + unsigned int num_ports; u8 ack_delay; int going_down; struct list_head cm_id_priv_list; @@ -262,7 +265,6 @@ struct cm_id_private { /* todo: use alternate port on send failure */ struct cm_av av; struct cm_av alt_av; - rwlock_t av_rwlock; /* Do not acquire inside cm.lock */ void *private_data; __be64 tid; @@ -287,10 +289,23 @@ struct cm_id_private { atomic_t work_count; struct rdma_ucm_ece ece; - - struct list_head cm_dev_list; }; +static void cm_dev_release(struct kref *kref) +{ + struct cm_device *cm_dev = container_of(kref, struct cm_device, 
kref); + unsigned int i; + + for (i = 0; i != cm_dev->num_ports; i++) + kfree(cm_dev->port[i]); + kfree(cm_dev); +} + +static void cm_device_put(struct cm_device *cm_dev) +{ + kref_put(&cm_dev->kref, cm_dev_release); +} + static void cm_work_handler(struct work_struct *work); static inline void cm_deref_id(struct cm_id_private *cm_id_priv) @@ -306,12 +321,12 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) struct ib_ah *ah; int ret; - read_lock(&cm_id_priv->av_rwlock); if (!cm_id_priv->av.port) { ret = -EINVAL; goto out; } + spin_lock(&cm_id_priv->av.port.cm_dev->unregistration_lock); mad_agent = cm_id_priv->av.port->mad_agent; if (!mad_agent) { ret = -EINVAL; @@ -330,7 +345,6 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) GFP_ATOMIC, IB_MGMT_BASE_VERSION); - read_unlock(&cm_id_priv->av_rwlock); if (IS_ERR(m)) { rdma_destroy_ah(ah, 0); ret = PTR_ERR(m); @@ -346,7 +360,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) return m; out: - read_unlock(&cm_id_priv->av_rwlock); + spin_unlock(&cm_id_priv->av.port.cm_dev->unregistration_lock); return ERR_PTR(ret); } @@ -465,20 +479,18 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv, cm_id_priv->private_data_len = private_data_len; } -static void add_cm_id_to_cm_dev_list(struct cm_id_private *cm_id_priv, - struct cm_device *cm_dev) +static void cm_set_av_port(struct cm_av *av, struct cm_port *port) { - unsigned long flags; + struct cm_port *old_port = av->port; - spin_lock_irqsave(&cm.lock, flags); - if (cm_dev->going_down) - goto out; + if (old_port == port) + return; - if (!list_empty(&cm_id_priv->cm_dev_list)) - list_del(&cm_id_priv->cm_dev_list); - list_add_tail(&cm_id_priv->cm_dev_list, &cm_dev->cm_id_priv_list); -out: - spin_unlock_irqrestore(&cm.lock, flags); + av->port = port; + if (old_port) + cm_device_put(old_port->cm_dev); + if (port) + kref_get(&old_port->cm_dev->kref); } static int 
cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, @@ -505,11 +517,8 @@ static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, if (ret) return ret; - write_lock(&cm_id_priv->av_rwlock); - av->port = port; + cm_set_av_port(av, port); av->pkey_index = wc->pkey_index; - add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev); - write_unlock(&cm_id_priv->av_rwlock); rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); return 0; @@ -521,10 +530,7 @@ static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, { struct cm_av *av = &cm_id_priv->av; - write_lock(&cm_id_priv->av_rwlock); - av->port = port; - add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev); - write_unlock(&cm_id_priv->av_rwlock); + cm_set_av_port(av, port); av->pkey_index = wc->pkey_index; return ib_init_ah_attr_from_wc(port->cm_dev->ib_device, port->port_num, wc, @@ -588,12 +594,9 @@ static int cm_init_av_by_path(struct sa_path_rec *path, return -EINVAL; cm_dev = port->cm_dev; - read_lock(&cm_id_priv->av_rwlock); if (!is_priv_av && (!cm_id_priv->av.port || cm_dev != cm_id_priv->av.port->cm_dev)) ret = -EINVAL; - - read_unlock(&cm_id_priv->av_rwlock); if (ret) return ret; @@ -618,13 +621,8 @@ static int cm_init_av_by_path(struct sa_path_rec *path, if (ret) return ret; - write_lock(&cm_id_priv->av_rwlock); - av->port = port; + cm_set_av_port(av, port); av->timeout = path->packet_life_time + 1; - if (is_priv_av) - add_cm_id_to_cm_dev_list(cm_id_priv, cm_dev); - - write_unlock(&cm_id_priv->av_rwlock); rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); return 0; @@ -905,10 +903,8 @@ static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device, spin_lock_init(&cm_id_priv->lock); init_completion(&cm_id_priv->comp); INIT_LIST_HEAD(&cm_id_priv->work_list); - INIT_LIST_HEAD(&cm_id_priv->cm_dev_list); atomic_set(&cm_id_priv->work_count, -1); refcount_set(&cm_id_priv->refcount, 1); - rwlock_init(&cm_id_priv->av_rwlock); ret = xa_alloc_cyclic(&cm.local_id_table, &id, NULL, 
xa_limit_32b, &cm.local_id_next, GFP_KERNEL); @@ -1027,10 +1023,8 @@ static u8 cm_ack_timeout_req(struct cm_id_private *cm_id_priv, { u8 ack_delay = 0; - read_lock(&cm_id_priv->av_rwlock); - if (cm_id_priv->av.port && cm_id_priv->av.port->cm_dev) + if (cm_id_priv->av.port) ack_delay = cm_id_priv->av.port->cm_dev->ack_delay; - read_unlock(&cm_id_priv->av_rwlock); return cm_ack_timeout(ack_delay, packet_life_time); } @@ -1228,8 +1222,6 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) cm_id_priv->timewait_info = NULL; } - if (!list_empty(&cm_id_priv->cm_dev_list)) - list_del(&cm_id_priv->cm_dev_list); WARN_ON(cm_id_priv->listen_sharecount); WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node)); if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) @@ -1246,6 +1238,8 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) rdma_destroy_ah_attr(&cm_id_priv->av.ah_attr); rdma_destroy_ah_attr(&cm_id_priv->alt_av.ah_attr); kfree(cm_id_priv->private_data); + cm_set_av_port(&cm_id_priv->av, NULL); + cm_set_av_port(&cm_id_priv->alt_av, NULL); kfree_rcu(cm_id_priv, rcu); } @@ -1378,10 +1372,13 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv) { u64 hi_tid = 0, low_tid; - read_lock(&cm_id_priv->av_rwlock); - if (cm_id_priv->av.port && cm_id_priv->av.port->mad_agent) - hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32; - read_unlock(&cm_id_priv->av_rwlock); + if (cm_id_priv->av.port) { + spin_lock(&cm_id_priv->av.port->cm_dev->unregistration_lock); + if (cm_id_priv->av.port->mad_agent) + hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) + << 32; + spin_unlock(&cm_id_priv->av.port->cm_dev->unregistration_lock); + } low_tid = (u64)cm_id_priv->id.local_id; return cpu_to_be64(hi_tid | low_tid); @@ -1879,12 +1876,10 @@ static void cm_format_req_event(struct cm_work *work, param = &work->cm_event.param.req_rcvd; param->listen_id = listen_id; param->bth_pkey = cm_get_bth_pkey(work); - read_lock(&cm_id_priv->av_rwlock); if (cm_id_priv->av.port) 
param->port = cm_id_priv->av.port->port_num; else param->port = 0; - read_unlock(&cm_id_priv->av_rwlock); param->primary_path = &work->path[0]; cm_opa_to_ib_sgid(work, param->primary_path); if (cm_req_has_alt_path(req_msg)) { @@ -2311,13 +2306,11 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg, IBA_SET(CM_REP_STARTING_PSN, rep_msg, param->starting_psn); IBA_SET(CM_REP_RESPONDER_RESOURCES, rep_msg, param->responder_resources); - read_lock(&cm_id_priv->av_rwlock); if (cm_id_priv->av.port && cm_id_priv->av.port->cm_dev) IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg, cm_id_priv->av.port->cm_dev->ack_delay); else IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg, 0); - read_unlock(&cm_id_priv->av_rwlock); IBA_SET(CM_REP_FAILOVER_ACCEPTED, rep_msg, param->failover_accepted); IBA_SET(CM_REP_RNR_RETRY_COUNT, rep_msg, param->rnr_retry_count); IBA_SET(CM_REP_LOCAL_CA_GUID, rep_msg, @@ -4187,10 +4180,8 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_ATOMIC; qp_attr->pkey_index = cm_id_priv->av.pkey_index; - read_lock(&cm_id_priv->av_rwlock); qp_attr->port_num = cm_id_priv->av.port ? cm_id_priv->av.port->port_num : 0; - read_unlock(&cm_id_priv->av_rwlock); ret = 0; break; default: @@ -4234,10 +4225,8 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, } if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) { *qp_attr_mask |= IB_QP_ALT_PATH; - read_lock(&cm_id_priv->av_rwlock); qp_attr->alt_port_num = cm_id_priv->alt_av.port ? 
cm_id_priv->alt_av.port->port_num : 0; - read_unlock(&cm_id_priv->av_rwlock); qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; @@ -4296,10 +4285,8 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, } } else { *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE; - read_lock(&cm_id_priv->av_rwlock); qp_attr->alt_port_num = cm_id_priv->alt_av.port ? cm_id_priv->alt_av.port->port_num : 0; - read_unlock(&cm_id_priv->av_rwlock); qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; @@ -4417,9 +4404,11 @@ static int cm_add_one(struct ib_device *ib_device) if (!cm_dev) return -ENOMEM; + kref_init(&cm_dev->kref); cm_dev->ib_device = ib_device; cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; + cm_dev->num_ports = ib_device->phys_port_cnt; INIT_LIST_HEAD(&cm_dev->cm_id_priv_list); set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); @@ -4489,10 +4478,9 @@ static int cm_add_one(struct ib_device *ib_device) ib_modify_port(ib_device, port->port_num, 0, &port_modify); ib_unregister_mad_agent(port->mad_agent); cm_remove_port_fs(port); - kfree(port); } free: - kfree(cm_dev); + cm_device_put(cm_dev); return ret; } @@ -4515,21 +4503,15 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) cm_dev->going_down = 1; spin_unlock_irq(&cm.lock); - list_for_each_entry_safe(cm_id_priv, tmp, - &cm_dev->cm_id_priv_list, cm_dev_list) { - write_lock(&cm_id_priv->av_rwlock); - if (!list_empty(&cm_id_priv->cm_dev_list)) - list_del(&cm_id_priv->cm_dev_list); - cm_id_priv->av.port = NULL; - cm_id_priv->alt_av.port = NULL; - write_unlock(&cm_id_priv->av_rwlock); - } - rdma_for_each_port (ib_device, i) { + struct ib_mad_agent *mad_agent; + if (!rdma_cap_ib_cm(ib_device, i)) continue; port = cm_dev->port[i-1]; + 
mad_agent = port->mad_agent; + ib_modify_port(ib_device, port->port_num, 0, &port_modify); /* * We flush the queue here after the going_down set, this @@ -4537,12 +4519,20 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) * after that we can call the unregister_mad_agent */ flush_workqueue(cm.wq); - ib_unregister_mad_agent(port->mad_agent); + /* + * The above ensures no call paths from the work are running, + * the remaining paths all take the unregistration lock + */ + spin_lock(&cm_dev->unregistration_lock); + port->mad_agent = NULL; + spin_unlock(&cm_dev->unregistration_lock); + ib_unregister_mad_agent(mad_agent); cm_remove_port_fs(port); - kfree(port); } - kfree(cm_dev); + /* All touches can only be on call path from the work */ + cm_dev->ib_device = NULL; + cm_device_put(cm_dev); } static int __init ib_cm_init(void)
On 4/23/2021 3:34 AM, Jason Gunthorpe wrote: > On Wed, Apr 21, 2021 at 02:40:37PM +0300, Leon Romanovsky wrote: >> @@ -4396,6 +4439,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) >> cm_dev->going_down = 1; >> spin_unlock_irq(&cm.lock); >> >> + list_for_each_entry_safe(cm_id_priv, tmp, >> + &cm_dev->cm_id_priv_list, cm_dev_list) { >> + if (!list_empty(&cm_id_priv->cm_dev_list)) >> + list_del(&cm_id_priv->cm_dev_list); >> + cm_id_priv->av.port = NULL; >> + cm_id_priv->alt_av.port = NULL; >> + } > > Ugh, this is in the wrong order, it has to be after the work queue > flush.. > > Hurm, I didn't see an easy way to fix it up, but I did think of a much > better design! > > Generally speaking all we need is the memory of the cm_dev and port to > remain active, we don't need to block or fence with cm_remove_one(), > so just stick a memory kref on this thing and keep the memory. The > only things that needs to seralize with cm_remove_one() are on the > workqueue or take a spinlock (eg because they touch mad_agent) > > Try this, I didn't finish every detail, applies on top of your series, > but you'll need to reflow it into new commits: Thanks Jason, I think we still need a rwlock to protect "av->port"? It is modified and cleared by cm_set_av_port() and read in many places.
On Fri, Apr 23, 2021 at 09:14:21PM +0800, Mark Zhang wrote: > > > On 4/23/2021 3:34 AM, Jason Gunthorpe wrote: > > On Wed, Apr 21, 2021 at 02:40:37PM +0300, Leon Romanovsky wrote: > > > @@ -4396,6 +4439,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) > > > cm_dev->going_down = 1; > > > spin_unlock_irq(&cm.lock); > > > + list_for_each_entry_safe(cm_id_priv, tmp, > > > + &cm_dev->cm_id_priv_list, cm_dev_list) { > > > + if (!list_empty(&cm_id_priv->cm_dev_list)) > > > + list_del(&cm_id_priv->cm_dev_list); > > > + cm_id_priv->av.port = NULL; > > > + cm_id_priv->alt_av.port = NULL; > > > + } > > > > Ugh, this is in the wrong order, it has to be after the work queue > > flush.. > > > > Hurm, I didn't see an easy way to fix it up, but I did think of a much > > better design! > > > > Generally speaking all we need is the memory of the cm_dev and port to > > remain active, we don't need to block or fence with cm_remove_one(), > > so just stick a memory kref on this thing and keep the memory. The > > only things that needs to seralize with cm_remove_one() are on the > > workqueue or take a spinlock (eg because they touch mad_agent) > > > > Try this, I didn't finish every detail, applies on top of your series, > > but you'll need to reflow it into new commits: > > Thanks Jason, I think we still need a rwlock to protect "av->port"? It is > modified and cleared by cm_set_av_port() and read in many places. Hum.. This is a real mess. 
It looks to me like any access to the av->port should always be protected by the cm_id_priv->lock Most already are, but the sets are wrong and a couple readers are wrong Set reverse call chains: cm_init_av_for_lap() cm_lap_handler(work) (ok) cm_init_av_for_response() cm_req_handler(work) (OK, cm_id_priv is on stack) cm_sidr_req_handler(work) (OK, cm_id_priv is on stack) cm_init_av_by_path() cm_req_handler(work) (OK, cm_id_priv is on stack) cm_lap_handler(work) (OK) ib_send_cm_req() (not locked) cma_connect_ib() rdma_connect_locked() [..] ipoib_cm_send_req() srp_send_req() srp_connect_ch() [..] ib_send_cm_sidr_req() (not locked) cma_resolve_ib_udp() rdma_connect_locked() And cm_destroy_id() (locked) And read reverse call chains: cm_alloc_msg() ib_send_cm_req() (not locked) ib_send_cm_rep() (OK) ib_send_cm_rtu() (OK) cm_send_dreq_locked() (OK) cm_send_drep_locked() (OK) cm_send_rej_locked() (OK) ib_send_cm_mra() (OK) ib_send_cm_sidr_req() (not locked) cm_send_sidr_rep_locked() (OK) cm_form_tid() cm_format_req() ib_send_cm_req() (sort of OK) cm_format_dreq() cm_send_dreq_locked (OK) cm_format_req() ib_send_cm_req() (sort of OK) cm_format_req_event() cm_req_handler() (OK, cm_id_priv is on stack) cm_format_rep() ib_send_cm_rep() (OK) cm_rep_handler(work) (OK) cm_establish_handler(work) (OK) cm_rtu_handler(work) (OK) cm_send_dreq_locked() (OK) cm_dreq_handler(work) (OK) cm_drep_handler(work) (OK) cm_rej_handler(work) (OK) cm_mra_handler(work) (OK) cm_apr_handler(work) (OK) cm_sidr_rep_handler(work) (OK) cm_init_qp_init_attr() (OK) cm_init_qp_rtr_attr() (OK) cm_init_qp_rts_attr() (OK) So.. That leaves these functions that are not obviously locked correctly: ib_send_cm_req() ib_send_cm_sidr_req() And the way their locking expects to work is basically because they expect that there are not parallel touches to the cm_id - however I'm doubtful this is completely true. 
So no new lock needed, but something should be done about the above two functions, and this should be documented Jason
On 4/23/2021 10:24 PM, Jason Gunthorpe wrote: > On Fri, Apr 23, 2021 at 09:14:21PM +0800, Mark Zhang wrote: >> >> >> On 4/23/2021 3:34 AM, Jason Gunthorpe wrote: >>> On Wed, Apr 21, 2021 at 02:40:37PM +0300, Leon Romanovsky wrote: >>>> @@ -4396,6 +4439,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) >>>> cm_dev->going_down = 1; >>>> spin_unlock_irq(&cm.lock); >>>> + list_for_each_entry_safe(cm_id_priv, tmp, >>>> + &cm_dev->cm_id_priv_list, cm_dev_list) { >>>> + if (!list_empty(&cm_id_priv->cm_dev_list)) >>>> + list_del(&cm_id_priv->cm_dev_list); >>>> + cm_id_priv->av.port = NULL; >>>> + cm_id_priv->alt_av.port = NULL; >>>> + } >>> >>> Ugh, this is in the wrong order, it has to be after the work queue >>> flush.. >>> >>> Hurm, I didn't see an easy way to fix it up, but I did think of a much >>> better design! >>> >>> Generally speaking all we need is the memory of the cm_dev and port to >>> remain active, we don't need to block or fence with cm_remove_one(), >>> so just stick a memory kref on this thing and keep the memory. The >>> only things that needs to seralize with cm_remove_one() are on the >>> workqueue or take a spinlock (eg because they touch mad_agent) >>> >>> Try this, I didn't finish every detail, applies on top of your series, >>> but you'll need to reflow it into new commits: >> >> Thanks Jason, I think we still need a rwlock to protect "av->port"? It is >> modified and cleared by cm_set_av_port() and read in many places. > > Hum.. > > This is a real mess. 
> > It looks to me like any access to the av->port should always be > protected by the cm_id_priv->lock > > Most already are, but the sets are wrong and a couple readers are wrong > > Set reverse call chains: > > cm_init_av_for_lap() > cm_lap_handler(work) (ok) > > cm_init_av_for_response() > cm_req_handler(work) (OK, cm_id_priv is on stack) > cm_sidr_req_handler(work) (OK, cm_id_priv is on stack) > > cm_init_av_by_path() > cm_req_handler(work) (OK, cm_id_priv is on stack) > cm_lap_handler(work) (OK) > ib_send_cm_req() (not locked) > cma_connect_ib() > rdma_connect_locked() > [..] > ipoib_cm_send_req() > srp_send_req() > srp_connect_ch() > [..] > ib_send_cm_sidr_req() (not locked) > cma_resolve_ib_udp() > rdma_connect_locked() > Both cm_init_av_for_lap() and cm_init_av_by_path() might sleep (the last patch of this series tries to fix this issue), they cannot be called with cm_id_priv->lock > And > cm_destroy_id() (locked) > > And read reverse call chains: > > cm_alloc_msg() > ib_send_cm_req() (not locked) > ib_send_cm_rep() (OK) > ib_send_cm_rtu() (OK) > cm_send_dreq_locked() (OK) > cm_send_drep_locked() (OK) > cm_send_rej_locked() (OK) > ib_send_cm_mra() (OK) > ib_send_cm_sidr_req() (not locked) > cm_send_sidr_rep_locked() (OK) > cm_form_tid() > cm_format_req() > ib_send_cm_req() (sort of OK) > cm_format_dreq() > cm_send_dreq_locked (OK) > cm_format_req() > ib_send_cm_req() (sort of OK) > cm_format_req_event() > cm_req_handler() (OK, cm_id_priv is on stack) > cm_format_rep() > ib_send_cm_rep() (OK) > cm_rep_handler(work) (OK) > cm_establish_handler(work) (OK) > cm_rtu_handler(work) (OK) > cm_send_dreq_locked() (OK) > cm_dreq_handler(work) (OK) > cm_drep_handler(work) (OK) > cm_rej_handler(work) (OK) > cm_mra_handler(work) (OK) > cm_apr_handler(work) (OK) > cm_sidr_rep_handler(work) (OK) > cm_init_qp_init_attr() (OK) > cm_init_qp_rtr_attr() (OK) > cm_init_qp_rts_attr() (OK) > > So.. 
That leaves these functions that are not obviously locked > correctly: > ib_send_cm_req() > ib_send_cm_sidr_req() > > And the way their locking expects to work is basically because they > expect that there are not parallel touches to the cm_id - however I'm > doubtful this is completely true. > > So no new lock needed, but something should be done about the above > two functions, and this should be documented > > Jason >
On Sat, Apr 24, 2021 at 10:33:13AM +0800, Mark Zhang wrote: > > > > Set reverse call chains: > > > > cm_init_av_for_lap() > > cm_lap_handler(work) (ok) > > > > cm_init_av_for_response() > > cm_req_handler(work) (OK, cm_id_priv is on stack) > > cm_sidr_req_handler(work) (OK, cm_id_priv is on stack) > > > > cm_init_av_by_path() > > cm_req_handler(work) (OK, cm_id_priv is on stack) > > cm_lap_handler(work) (OK) > > ib_send_cm_req() (not locked) > > cma_connect_ib() > > rdma_connect_locked() > > [..] > > ipoib_cm_send_req() > > srp_send_req() > > srp_connect_ch() > > [..] > > ib_send_cm_sidr_req() (not locked) > > cma_resolve_ib_udp() > > rdma_connect_locked() > > > > Both cm_init_av_for_lap() Well, it is wrong today, look at cm_lap_handler(): spin_lock_irq(&cm_id_priv->lock); [..] ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); [..] cm_queue_work_unlock(cm_id_priv, work); These need to be restructured, the sleeping calls to extract the new_ah_attr have to be done before we go into the spinlock. That is probably the general solution to all the cases, do some work before the lock and then copy from the stack to the memory under the spinlock. Jason
On 4/26/2021 9:56 PM, Jason Gunthorpe wrote: > On Sat, Apr 24, 2021 at 10:33:13AM +0800, Mark Zhang wrote: >>> >>> Set reverse call chains: >>> >>> cm_init_av_for_lap() >>> cm_lap_handler(work) (ok) >>> >>> cm_init_av_for_response() >>> cm_req_handler(work) (OK, cm_id_priv is on stack) >>> cm_sidr_req_handler(work) (OK, cm_id_priv is on stack) >>> >>> cm_init_av_by_path() >>> cm_req_handler(work) (OK, cm_id_priv is on stack) >>> cm_lap_handler(work) (OK) >>> ib_send_cm_req() (not locked) >>> cma_connect_ib() >>> rdma_connect_locked() >>> [..] >>> ipoib_cm_send_req() >>> srp_send_req() >>> srp_connect_ch() >>> [..] >>> ib_send_cm_sidr_req() (not locked) >>> cma_resolve_ib_udp() >>> rdma_connect_locked() >>> >> >> Both cm_init_av_for_lap() > > Well, it is wrong today, look at cm_lap_handler(): > > spin_lock_irq(&cm_id_priv->lock); > [..] > ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc, > work->mad_recv_wc->recv_buf.grh, > &cm_id_priv->av); > [..] > cm_queue_work_unlock(cm_id_priv, work); > > These need to be restructured, the sleeping calls to extract the > new_ah_attr have to be done before we go into the spinlock. > > That is probably the general solution to all the cases, do some work > before the lock and then copy from the stack to the memory under the > spinlock. Maybe we can call cm_set_av_port(av, port) outside of cm_init_av_*? So that we can apply cm_id_priv->lock when needed.
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 2d62c90f9790..f1a24492924f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -209,6 +209,7 @@ struct cm_device { struct ib_device *ib_device; u8 ack_delay; int going_down; + struct list_head cm_id_priv_list; struct cm_port *port[]; }; @@ -285,6 +286,8 @@ struct cm_id_private { atomic_t work_count; struct rdma_ucm_ece ece; + + struct list_head cm_dev_list; }; static void cm_work_handler(struct work_struct *work); @@ -440,9 +443,28 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv, cm_id_priv->private_data_len = private_data_len; } +static void add_cm_id_to_cm_dev_list(struct cm_id_private *cm_id_priv, + struct cm_device *cm_dev) +{ + unsigned long flags; + + spin_lock_irqsave(&cm.lock, flags); + if (cm_dev->going_down) + goto out; + + if (!list_empty(&cm_id_priv->cm_dev_list)) + list_del(&cm_id_priv->cm_dev_list); + list_add_tail(&cm_id_priv->cm_dev_list, &cm_dev->cm_id_priv_list); + +out: + spin_unlock_irqrestore(&cm.lock, flags); +} + static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, - struct ib_grh *grh, struct cm_av *av) + struct ib_grh *grh, + struct cm_id_private *cm_id_priv) { + struct cm_av *av = &cm_id_priv->av; struct rdma_ah_attr new_ah_attr; int ret; @@ -462,14 +484,20 @@ static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, if (ret) return ret; + add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev); + rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); return 0; } static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, - struct ib_grh *grh, struct cm_av *av) + struct ib_grh *grh, + struct cm_id_private *cm_id_priv) { + struct cm_av *av = &cm_id_priv->av; + av->port = port; + add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev); av->pkey_index = wc->pkey_index; return ib_init_ah_attr_from_wc(port->cm_dev->ib_device, port->port_num, wc, @@ -519,11 +547,13 @@ 
get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr) static int cm_init_av_by_path(struct sa_path_rec *path, const struct ib_gid_attr *sgid_attr, - struct cm_av *av) + struct cm_id_private *cm_id_priv, + bool is_priv_av) { struct rdma_ah_attr new_ah_attr; struct cm_device *cm_dev; struct cm_port *port; + struct cm_av *av; int ret; port = get_cm_port_from_path(path, sgid_attr); @@ -531,6 +561,11 @@ static int cm_init_av_by_path(struct sa_path_rec *path, return -EINVAL; cm_dev = port->cm_dev; + if (!is_priv_av && cm_dev != cm_id_priv->av.port->cm_dev) + return -EINVAL; + + av = is_priv_av ? &cm_id_priv->av : &cm_id_priv->alt_av; + ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num, be16_to_cpu(path->pkey), &av->pkey_index); if (ret) @@ -554,6 +589,9 @@ static int cm_init_av_by_path(struct sa_path_rec *path, av->timeout = path->packet_life_time + 1; rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); + if (is_priv_av) + add_cm_id_to_cm_dev_list(cm_id_priv, cm_dev); + return 0; } @@ -832,6 +870,7 @@ static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device, spin_lock_init(&cm_id_priv->lock); init_completion(&cm_id_priv->comp); INIT_LIST_HEAD(&cm_id_priv->work_list); + INIT_LIST_HEAD(&cm_id_priv->cm_dev_list); atomic_set(&cm_id_priv->work_count, -1); refcount_set(&cm_id_priv->refcount, 1); @@ -1133,6 +1172,8 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) cm_id_priv->timewait_info = NULL; } + if (!list_empty(&cm_id_priv->cm_dev_list)) + list_del(&cm_id_priv->cm_dev_list); WARN_ON(cm_id_priv->listen_sharecount); WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node)); if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) @@ -1500,12 +1541,12 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, } ret = cm_init_av_by_path(param->primary_path, - param->ppath_sgid_attr, &cm_id_priv->av); + param->ppath_sgid_attr, cm_id_priv, true); if (ret) goto out; if (param->alternate_path) { ret = cm_init_av_by_path(param->alternate_path, NULL, - 
&cm_id_priv->alt_av); + cm_id_priv, false); if (ret) goto out; } @@ -2083,7 +2124,7 @@ static int cm_req_handler(struct cm_work *work) ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); + cm_id_priv); if (ret) goto destroy; cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> @@ -2137,7 +2178,7 @@ static int cm_req_handler(struct cm_work *work) sa_path_set_dmac(&work->path[0], cm_id_priv->av.ah_attr.roce.dmac); work->path[0].hop_limit = grh->hop_limit; - ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av); + ret = cm_init_av_by_path(&work->path[0], gid_attr, cm_id_priv, true); if (ret) { int err; @@ -2156,7 +2197,7 @@ static int cm_req_handler(struct cm_work *work) } if (cm_req_has_alt_path(req_msg)) { ret = cm_init_av_by_path(&work->path[1], NULL, - &cm_id_priv->alt_av); + cm_id_priv, false); if (ret) { ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_ALT_GID, @@ -3328,12 +3369,12 @@ static int cm_lap_handler(struct cm_work *work) ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); + cm_id_priv); if (ret) goto unlock; ret = cm_init_av_by_path(param->alternate_path, NULL, - &cm_id_priv->alt_av); + cm_id_priv, false); if (ret) goto unlock; @@ -3452,7 +3493,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, cm_id_priv = container_of(cm_id, struct cm_id_private, id); ret = cm_init_av_by_path(param->path, param->sgid_attr, - &cm_id_priv->av); + cm_id_priv, true); if (ret) return ret; @@ -3542,7 +3583,7 @@ static int cm_sidr_req_handler(struct cm_work *work) cm_id_priv->av.dgid.global.interface_id = 0; ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); + cm_id_priv); if (ret) goto out; @@ -4303,6 +4344,7 @@ static int cm_add_one(struct ib_device *ib_device) cm_dev->ib_device = ib_device; cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; 
cm_dev->going_down = 0; + INIT_LIST_HEAD(&cm_dev->cm_id_priv_list); set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); rdma_for_each_port (ib_device, i) { @@ -4381,6 +4423,7 @@ static int cm_add_one(struct ib_device *ib_device) static void cm_remove_one(struct ib_device *ib_device, void *client_data) { struct cm_device *cm_dev = client_data; + struct cm_id_private *cm_id_priv, *tmp; struct cm_port *port; struct ib_port_modify port_modify = { .clr_port_cap_mask = IB_PORT_CM_SUP @@ -4396,6 +4439,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) cm_dev->going_down = 1; spin_unlock_irq(&cm.lock); + list_for_each_entry_safe(cm_id_priv, tmp, + &cm_dev->cm_id_priv_list, cm_dev_list) { + if (!list_empty(&cm_id_priv->cm_dev_list)) + list_del(&cm_id_priv->cm_dev_list); + cm_id_priv->av.port = NULL; + cm_id_priv->alt_av.port = NULL; + } + rdma_for_each_port (ib_device, i) { if (!rdma_cap_ib_cm(ib_device, i)) continue;