@@ -225,6 +225,7 @@ typedef uint16_t ib_hca_port_t;
#define DCM_RETRY_CNT 10
#define DCM_REP_TIME 800 /* reply timeout in m_secs */
#define DCM_RTU_TIME 800 /* rtu timeout in m_secs */
+#define DCM_WAIT_TIME 60000 /* CM timewait expiry in m_secs; default for DAPL_UCM_WAIT_TIME */
#define DCM_QP_SIZE 500 /* uCM tx, rx qp size */
#define DCM_CQ_SIZE 500 /* uCM cq size */
#define DCM_TX_BURST 50 /* uCM signal, every TX burst msgs posted */
@@ -231,38 +231,26 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int *timer)
*timer = cm->hca->ib_trans.cm_timer;
if ((time - cm->timer)/1000 >
(cm->hca->ib_trans.rtu_time << cm->retries)) {
- dapl_log(DAPL_DBG_TYPE_CM,
- " CM_TIMEWAIT %d %p [lid, port, cqp, iqp]:"
- " %x %x %x %x -> %x %x %x %x r_pid %x"
- " Time(ms) %d > %d\n",
- cm->retries+1, cm,
- ntohs(cm->msg.saddr.ib.lid), ntohs(cm->msg.sport),
- ntohl(cm->msg.sqpn), ntohl(cm->msg.saddr.ib.qpn),
- ntohs(cm->msg.daddr.ib.lid), ntohs(cm->msg.dport),
- ntohl(cm->msg.dqpn), ntohl(cm->msg.daddr.ib.qpn),
- ntohl(cm->msg.d_id),
- (time - cm->timer)/1000,
- cm->hca->ib_trans.rtu_time << cm->retries);
cm->retries++;
- }
- if (cm->retries > 2) {
- dapl_log(DAPL_DBG_TYPE_CM_WARN,
- " CM_TIMEWAIT EXPIRED %d %p [lid, port, cqp, iqp]:"
- " %x %x %x %x -> %x %x %x %x r_pid %x"
- " Time(ms) %d > %d\n",
- cm->retries+1, cm,
- ntohs(cm->msg.saddr.ib.lid), ntohs(cm->msg.sport),
- ntohl(cm->msg.sqpn), ntohl(cm->msg.saddr.ib.qpn),
- ntohs(cm->msg.daddr.ib.lid), ntohs(cm->msg.dport),
- ntohl(cm->msg.dqpn), ntohl(cm->msg.daddr.ib.qpn),
- ntohl(cm->msg.d_id),
- (time - cm->timer)/1000,
- cm->hca->ib_trans.rtu_time << cm->retries);
- cm->ah = NULL; /* consumer will free AH */
- cm->state = DCM_FREE;
- dapl_os_unlock(&cm->lock);
- dapl_ep_unlink_cm(cm->ep, cm); /* last CM ref */
- return;
+ if ((time - cm->timer)/1000 > cm->hca->ib_trans.wait_time) {
+ dapl_log(DAPL_DBG_TYPE_CM_WARN,
+ " CM_TIMEWAIT EXPIRED %d %p [lid, port, cqp, iqp]:"
+ " %x %x %x %x -> %x %x %x %x r_pid %x"
+ " Time(ms) %d > %d\n",
+ cm->retries+1, cm,
+ ntohs(cm->msg.saddr.ib.lid), ntohs(cm->msg.sport),
+ ntohl(cm->msg.sqpn), ntohl(cm->msg.saddr.ib.qpn),
+ ntohs(cm->msg.daddr.ib.lid), ntohs(cm->msg.dport),
+ ntohl(cm->msg.dqpn), ntohl(cm->msg.daddr.ib.qpn),
+ ntohl(cm->msg.d_id),
+ (time - cm->timer)/1000,
+ cm->hca->ib_trans.wait_time);
+ cm->ah = NULL; /* consumer will free AH */
+ cm->state = DCM_FREE;
+ dapl_os_unlock(&cm->lock);
+ dapl_ep_unlink_cm(cm->ep, cm); /* last CM ref */
+ return;
+ }
}
break;
@@ -737,7 +725,7 @@ void dapls_cm_release(dp_ib_cm_handle_t cm)
dapl_os_lock(&cm->lock);
cm->ref_count--;
if (cm->ref_count) {
- if (cm->ref_count == 1)
+ if ((cm->ref_count == 1) && (cm->list_entry.list_head))
dapl_os_wait_object_wakeup(&cm->f_event);
dapl_os_unlock(&cm->lock);
return;
@@ -101,6 +101,7 @@ typedef struct _ib_hca_transport
int cm_timer;
int rep_time;
int rtu_time;
+ int wait_time;
DAPL_OS_LOCK slock;
int s_hd;
int s_tl;
@@ -504,12 +504,11 @@ static int ucm_service_create(IN DAPL_HCA *hca)
int hlen = sizeof(struct ibv_grh); /* hdr included with UD recv */
char *rbuf;
- dapl_dbg_log(DAPL_DBG_TYPE_UTIL, " ucm_create: \n");
-
/* setup CM timers and queue sizes */
tp->retries = dapl_os_get_env_val("DAPL_UCM_RETRY", DCM_RETRY_CNT);
tp->rep_time = dapl_os_get_env_val("DAPL_UCM_REP_TIME", DCM_REP_TIME);
tp->rtu_time = dapl_os_get_env_val("DAPL_UCM_RTU_TIME", DCM_RTU_TIME);
+ tp->wait_time = dapl_os_get_env_val("DAPL_UCM_WAIT_TIME", DCM_WAIT_TIME);
tp->cm_timer = DAPL_MIN(tp->rep_time,tp->rtu_time);
tp->qpe = dapl_os_get_env_val("DAPL_UCM_QP_SIZE", DCM_QP_SIZE);
tp->cqe = dapl_os_get_env_val("DAPL_UCM_CQ_SIZE", DCM_CQ_SIZE);
@@ -519,8 +518,10 @@ static int ucm_service_create(IN DAPL_HCA *hca)
goto bail;
dapl_log(DAPL_DBG_TYPE_UTIL,
- " create_service: pd %p ctx %p handle 0x%x\n",
- tp->pd, tp->pd->context, tp->pd->handle);
+ " UCM: CM service - pd %p ctx %p "
+ " Timers(ms): req %d rtu %d wait %d\n",
+ tp->pd, tp->pd->context, tp->rep_time,
+ tp->rtu_time, tp->wait_time);
tp->rch = ibv_create_comp_channel(hca->ib_hca_handle);
if (!tp->rch)