diff mbox

[ceph-users] Help needed porting Ceph to RSockets

Message ID 1828884A29C6694DAF28B7E6B8A8237388CF3072@ORSMSX109.amr.corp.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Hefty, Sean Oct. 30, 2013, 11:25 p.m. UTC
Sorry it took so long to get to this.

> after some more analysis and debugging, I found
> workarounds for my problems; I have added these workarounds
> to the last version of the patch for the poll problem by Sean;
> see the attachment to this posting.
> 
> The shutdown() operations below are all SHUT_RDWR.
> 
> 1. shutdown() on side A of a connection waits for close() on side B
> 
>    With rsockets, when a shutdown is done on side A of a socket
>    connection, then the shutdown will only return after side B
>    has done a close() on its end of the connection.
>
>    This is different from TCP/IP sockets: there a shutdown will cause
>    the other end to terminate the connection at the TCP level
>    instantly. The socket changes state into CLOSE_WAIT, which indicates
>    that the application level close is outstanding.
> 
>    In the attached patch, the workaround is in rs_poll_cq(),
>    case RS_OP_CTRL, where for a RS_CTRL_DISCONNECT the rshutdown()
>    is called on side B; this will cause the termination of the
>    socket connection to acknowledged to side A and the shutdown()
>    there can now terminate.

rsockets should behave similar to sockets on this.  I believe that I introduced the problem with my patch to fix the shutdown hang.  The line in question was adding rdma_disconnect() in the rshutdown call.  I replaced the disconnect call with one that simply transitions the underlying QP into an error state.

> 2. double (multiple) shutdown on side A: delay on 2nd shutdown
> 
>    When an application does a shutdown() of side A and does a 2nd
>    shutdown() shortly after (for whatever reason) then the
>    return of the 2nd shutdown() is delayed by 2 seconds.
> 
>    The delay happens in rdma_disconnect(), when this is called
>    from rshutdown() in the case that the rsocket state is
>    rs_disconnected.
> 
>    Even if it could be considered as a bug if an application
>    calls shutdown() twice on the same socket, it still
>    does not make sense to delay that 2nd call to shutdown().

I'd like to avoid calling shutdown internally on the user's behalf.  The internal shutdown may be the reason shutdown ends up being called twice.

I don't understand why you see a 2 second delay, but hopefully removing the overhead of the full rdma_disconnect call fixes things.
 
Can you please try the attached patch in place of all previous patches?

Thanks
- Sean


rsockets: Handle race between rshutdown and rpoll

From: Sean Hefty <sean.hefty@intel.com>

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
---
 src/cma.c     |   23 +++++++++++++----------
 src/cma.h     |    1 +
 src/rsocket.c |   35 +++++++++++++++++++++++++----------
 3 files changed, 39 insertions(+), 20 deletions(-)

Comments

Gandalf Corvotempesta Feb. 5, 2014, 1:58 p.m. UTC | #1
2013-10-31 Hefty, Sean <sean.hefty@intel.com>:
> Can you please try the attached patch in place of all previous patches?

Any updates on ceph with rsockets?
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/src/cma.c b/src/cma.c
index 374844c..4f41879 100644
--- a/src/cma.c
+++ b/src/cma.c
@@ -1543,22 +1543,25 @@  int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event)
 	return 0;
 }
 
-int rdma_disconnect(struct rdma_cm_id *id)
+int ucma_shutdown(struct rdma_cm_id *id)
 {
-	struct ucma_abi_disconnect cmd;
-	struct cma_id_private *id_priv;
-	int ret;
-
 	switch (id->verbs->device->transport_type) {
 	case IBV_TRANSPORT_IB:
-		ret = ucma_modify_qp_err(id);
-		break;
+		return ucma_modify_qp_err(id);
 	case IBV_TRANSPORT_IWARP:
-		ret = ucma_modify_qp_sqd(id);
-		break;
+		return ucma_modify_qp_sqd(id);
 	default:
-		ret = ERR(EINVAL);
+		return ERR(EINVAL);
 	}
+}
+
+int rdma_disconnect(struct rdma_cm_id *id)
+{
+	struct ucma_abi_disconnect cmd;
+	struct cma_id_private *id_priv;
+	int ret;
+
+	ret = ucma_shutdown(id);
 	if (ret)
 		return ret;
 
diff --git a/src/cma.h b/src/cma.h
index e944a9a..4c991b4 100644
--- a/src/cma.h
+++ b/src/cma.h
@@ -150,6 +150,7 @@  void ucma_set_sid(enum rdma_port_space ps, struct sockaddr *addr,
 		  struct sockaddr_ib *sib);
 int ucma_max_qpsize(struct rdma_cm_id *id);
 int ucma_complete(struct rdma_cm_id *id);
+int ucma_shutdown(struct rdma_cm_id *id);
 
 static inline int ERR(int err)
 {
diff --git a/src/rsocket.c b/src/rsocket.c
index d544dd0..c7a491b 100644
--- a/src/rsocket.c
+++ b/src/rsocket.c
@@ -1822,7 +1822,12 @@  static int rs_poll_cq(struct rsocket *rs)
 					rs->state = rs_disconnected;
 					return 0;
 				} else if (rs_msg_data(msg) == RS_CTRL_SHUTDOWN) {
-					rs->state &= ~rs_readable;
+					if (rs->state & rs_writable) {
+						rs->state &= ~rs_readable;
+					} else {
+						rs->state = rs_disconnected;
+						return 0;
+					}
 				}
 				break;
 			case RS_OP_WRITE:
@@ -2948,10 +2953,12 @@  static int rs_poll_events(struct pollfd *rfds, struct pollfd *fds, nfds_t nfds)
 
 		rs = idm_lookup(&idm, fds[i].fd);
 		if (rs) {
+			fastlock_acquire(&rs->cq_wait_lock);
 			if (rs->type == SOCK_STREAM)
 				rs_get_cq_event(rs);
 			else
 				ds_get_cq_event(rs);
+			fastlock_release(&rs->cq_wait_lock);
 			fds[i].revents = rs_poll_rs(rs, fds[i].events, 1, rs_poll_all);
 		} else {
 			fds[i].revents = rfds[i].revents;
@@ -3098,7 +3105,8 @@  int rselect(int nfds, fd_set *readfds, fd_set *writefds,
 
 /*
  * For graceful disconnect, notify the remote side that we're
- * disconnecting and wait until all outstanding sends complete.
+ * disconnecting and wait until all outstanding sends complete, provided
+ * that the remote side has not sent a disconnect message.
  */
 int rshutdown(int socket, int how)
 {
@@ -3106,11 +3114,6 @@  int rshutdown(int socket, int how)
 	int ctrl, ret = 0;
 
 	rs = idm_at(&idm, socket);
-	if (how == SHUT_RD) {
-		rs->state &= ~rs_readable;
-		return 0;
-	}
-
 	if (rs->fd_flags & O_NONBLOCK)
 		rs_set_nonblocking(rs, 0);
 
@@ -3118,15 +3121,20 @@  int rshutdown(int socket, int how)
 		if (how == SHUT_RDWR) {
 			ctrl = RS_CTRL_DISCONNECT;
 			rs->state &= ~(rs_readable | rs_writable);
-		} else {
+		} else if (how == SHUT_WR) {
 			rs->state &= ~rs_writable;
 			ctrl = (rs->state & rs_readable) ?
 				RS_CTRL_SHUTDOWN : RS_CTRL_DISCONNECT;
+		} else {
+			rs->state &= ~rs_readable;
+			if (rs->state & rs_writable)
+				goto out;
+			ctrl = RS_CTRL_DISCONNECT;
 		}
 		if (!rs->ctrl_avail) {
 			ret = rs_process_cq(rs, 0, rs_conn_can_send_ctrl);
 			if (ret)
-				return ret;
+				goto out;
 		}
 
 		if ((rs->state & rs_connected) && rs->ctrl_avail) {
@@ -3138,10 +3146,17 @@  int rshutdown(int socket, int how)
 	if (rs->state & rs_connected)
 		rs_process_cq(rs, 0, rs_conn_all_sends_done);
 
+out:
 	if ((rs->fd_flags & O_NONBLOCK) && (rs->state & rs_connected))
 		rs_set_nonblocking(rs, rs->fd_flags);
 
-	return 0;
+	if (rs->state & rs_disconnected) {
+		/* Generate event by flushing receives to unblock rpoll */
+		ibv_req_notify_cq(rs->cm_id->recv_cq, 0);
+		ucma_shutdown(rs->cm_id);
+	}
+
+	return ret;
 }
 
 static void ds_shutdown(struct rsocket *rs)