@@ -180,11 +180,13 @@ dapl_cr_accept(IN DAT_CR_HANDLE cr_handle,
entry_ep_state = ep_ptr->param.ep_state;
entry_ep_handle = cr_ptr->param.local_ep_handle;
ep_ptr->param.ep_state = DAT_EP_STATE_COMPLETION_PENDING;
- ep_ptr->cr_ptr = cr_ptr;
- ep_ptr->param.remote_ia_address_ptr =
- cr_ptr->param.remote_ia_address_ptr;
- cr_ptr->param.local_ep_handle = ep_handle;
+ /* UD supports multiple CR's per EP, provider will manage CR's */
+ if (ep_ptr->param.ep_attr.service_type == DAT_SERVICE_TYPE_RC) {
+ ep_ptr->cr_ptr = cr_ptr;
+ ep_ptr->param.remote_ia_address_ptr = cr_ptr->param.remote_ia_address_ptr;
+ }
+ cr_ptr->param.local_ep_handle = ep_handle;
dapl_os_unlock(&ep_ptr->header.lock);
dat_status = dapls_ib_accept_connection(cr_handle,
@@ -439,25 +439,25 @@ void dapls_cm_free(dp_ib_cm_handle_t cm_ptr)
cm_ptr, dapl_cm_state_str(cm_ptr->state),
cm_ptr->ep, sp_ptr, cm_ptr->ref_count);
+ dapl_os_lock(&cm_ptr->lock);
if (sp_ptr && cm_ptr->state == DCM_CONNECTED &&
cm_ptr->msg.daddr.ib.qp_type == IBV_QPT_UD) {
- DAPL_CR *cr_ptr = dapl_sp_search_cr(sp_ptr, cm_ptr);
+ DAPL_CR *cr_ptr;
+
+ dapl_os_lock(&sp_ptr->header.lock);
+ cr_ptr = dapl_sp_search_cr(sp_ptr, cm_ptr);
if (cr_ptr != NULL) {
- dapl_os_lock(&sp_ptr->header.lock);
dapl_sp_remove_cr(sp_ptr, cr_ptr);
- dapl_os_unlock(&sp_ptr->header.lock);
dapls_cr_free(cr_ptr);
}
+ dapl_os_unlock(&sp_ptr->header.lock);
}
/* free from internal workq, wait until EP is last ref */
- dapl_os_lock(&cm_ptr->lock);
cm_ptr->state = DCM_FREE;
- dapl_os_unlock(&cm_ptr->lock);
- dapli_cm_thread_signal(cm_ptr);
- dapl_os_lock(&cm_ptr->lock);
if (cm_ptr->ref_count != 1) {
+ dapli_cm_thread_signal(cm_ptr);
dapl_os_unlock(&cm_ptr->lock);
dapl_os_wait_object_wait(&cm_ptr->event, DAT_TIMEOUT_INFINITE);
dapl_os_lock(&cm_ptr->lock);
@@ -779,19 +779,19 @@ void dapli_cm_free(dp_ib_cm_handle_t cm)
cm, dapl_cm_state_str(cm->state),
cm->ep, sp_ptr, sp_ptr ? sp_ptr->cr_list_count:0, cm->ref_count);
+ dapl_os_lock(&cm->lock);
if (sp_ptr && cm->state == DCM_CONNECTED &&
cm->msg.daddr.ib.qp_type == IBV_QPT_UD) {
- DAPL_CR *cr_ptr = dapl_sp_search_cr(sp_ptr, cm);
- dapl_log(DAPL_DBG_TYPE_CM, " dapli_cm_free: UD CR %p\n", cr_ptr);
- if (cr_ptr != NULL) {
- dapl_os_lock(&sp_ptr->header.lock);
- dapl_sp_remove_cr(sp_ptr, cr_ptr);
- dapl_os_unlock(&sp_ptr->header.lock);
- dapls_cr_free(cr_ptr);
+ dapl_os_lock(&sp_ptr->header.lock);
+ cm->cr = dapl_sp_search_cr(sp_ptr, cm);
+ dapl_log(DAPL_DBG_TYPE_CM, " dapli_cm_free: UD CR %p\n", cm->cr);
+
+ if (cm->cr != NULL) {
+ dapl_sp_remove_cr(sp_ptr, cm->cr);
+ /* free CR at EP destroy */
}
+ dapl_os_unlock(&sp_ptr->header.lock);
}
-
- dapl_os_lock(&cm->lock);
cm->state = DCM_FREE;
dapls_thread_signal(&cm->hca->ib_trans.signal);
dapl_os_unlock(&cm->lock);
@@ -809,12 +809,12 @@ void dapls_cm_free(dp_ib_cm_handle_t cm)
dapl_os_lock(&cm->lock);
if (cm->state != DCM_FREE)
cm->state = DCM_FREE;
-
- dapl_os_unlock(&cm->lock);
- dapls_thread_signal(&cm->hca->ib_trans.signal);
- dapl_os_lock(&cm->lock);
+ if (cm->cr)
+ dapls_cr_free(cm->cr);
+
if (cm->ref_count != 1) {
+ dapls_thread_signal(&cm->hca->ib_trans.signal);
dapl_os_unlock(&cm->lock);
dapl_os_wait_object_wait(&cm->f_event, DAT_TIMEOUT_INFINITE);
dapl_os_lock(&cm->lock);
@@ -48,6 +48,7 @@ struct ib_cm_handle
struct dapl_hca *hca;
struct dapl_sp *sp;
struct dapl_ep *ep;
+ struct dapl_cr *cr;
struct ibv_ah *ah;
uint16_t p_size; /* accept p_data, for retries */
uint8_t p_data[DCM_MAX_PDATA_SIZE];
1024+ ranks, IMB alltoall may hit assert when running Intel MPI in UD mode. CR cleanup was implemented with EP to CR references still linked. During cr_accept, the CR remote_ia_address is linked to EP object by mistake with UD mode. UD mode my have multiple CRs per EP so no direct mappings to CR memory can exist. Only with RC mode which always has one EP to CR mapping. In scm, ucm: for CM object free with CR references the search and unlinking from SP must be under SP lock to serialize. Also, change cleanup thread wakeup logic to only trigger the thread if reference count indicates the need for more processing. Signed-off-by: Arlin Davis <arlin.r.davis@intel.com> --- dapl/common/dapl_cr_accept.c | 10 ++++++---- dapl/openib_scm/cm.c | 14 +++++++------- dapl/openib_ucm/cm.c | 26 +++++++++++++------------- dapl/openib_ucm/dapl_ib_util.h | 1 + 4 files changed, 27 insertions(+), 24 deletions(-) -- 1.7.3