diff mbox

uDAPL v2.0: scm: retry socket connect on ECONNREFUSED under heavy load

Message ID E3280858FA94444CA49D2BA02341C983011F10C355@orsmsx506.amr.corp.intel.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Arlin Davis Jan. 5, 2011, 1:04 a.m. UTC
None
diff mbox

Patch

diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index f82d0ff..b95db30 100644
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -390,6 +390,7 @@  static dp_ib_cm_handle_t dapli_cm_alloc(DAPL_EP *ep_ptr)
 
 	cm_ptr->msg.ver = htons(DCM_VER);
 	cm_ptr->socket = DAPL_INVALID_SOCKET;
+	cm_ptr->retry = SCM_CR_RETRY;
 	dapls_cm_acquire(cm_ptr);
 		
 	/* Link EP and CM */
@@ -507,10 +508,11 @@  static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
 				&cm_ptr->addr)->sin_addr), 
 			 ntohs(((struct sockaddr_in *)
 				&cm_ptr->addr)->sin_port),
-			 err == ETIMEDOUT ? "RETRYING...":"ABORTING");
+			 (err == ETIMEDOUT || err == ECONNREFUSED) ? 
+			 "RETRYING...":"ABORTING");
 
-		/* retry a timeout */
-		if (err == ETIMEDOUT) {
+		/* retry a timeout, or a refused connect while retries remain */
+		if ((err == ETIMEDOUT) || (err == ECONNREFUSED && --cm_ptr->retry)) {
 			closesocket(cm_ptr->socket);
 			cm_ptr->socket = DAPL_INVALID_SOCKET;
 			dapli_socket_connect(cm_ptr->ep, (DAT_IA_ADDRESS_PTR)&cm_ptr->addr, 
diff --git a/dapl/openib_scm/dapl_ib_util.h b/dapl/openib_scm/dapl_ib_util.h
index 4bb1a4a..5f9fb43 100644
--- a/dapl/openib_scm/dapl_ib_util.h
+++ b/dapl/openib_scm/dapl_ib_util.h
@@ -40,6 +40,7 @@  struct ib_cm_handle
 	DAPL_OS_LOCK		lock;
 	int			ref_count;
 	int			state;
+	int 			retry;
 	DAPL_SOCKET		socket;
 	struct dapl_hca		*hca;
 	struct dapl_sp		*sp;	
@@ -63,6 +64,7 @@  typedef dp_ib_cm_handle_t	ib_cm_srvc_handle_t;
 #define SCM_ACK_RETRY 7  /* 3 bits, 7 * 268ms = 1.8 seconds */
 #define SCM_RNR_TIMER 12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */
 #define SCM_RNR_RETRY 7  /* 3 bits, 7 == infinite */
+#define SCM_CR_RETRY  5  /* retries for busy server, connect refused */
 #define SCM_IB_MTU    2048
 
 /* Global routing defaults */