diff mbox

[5/7] librdmacm/rspreload: Do not block connect when supporting fork

Message ID 1828884A29C6694DAF28B7E6B8A8237346A8997B@ORSMSX101.amr.corp.intel.com (mailing list archive)
State Accepted
Headers show

Commit Message

Hefty, Sean Aug. 16, 2012, 7:24 p.m. UTC
Many FTP servers require fork support.  However, FTP clients,
such as ncftp, will perform the following call sequence:

send PASV request to server over connection 1
         server will listen for connection 2
issue nonblocking connect to server
send ACCEPT request to server over connection 1
         server will accept connection 2

The current fork support converts all nonblocking connect
calls to blocking.  The result is that the FTP client ends up
blocked waiting for the server to accept the connection,
which it will never do.

To handle this case, we have the active side follow the same
rule as the server side and defer establishing the rsocket
connection until the user calls the first data transfer routine.

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
---
 src/preload.c |  141 +++++++++++++++++++++++++++++++++++++++------------------
 1 files changed, 96 insertions(+), 45 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Sridhar Samudrala Aug. 18, 2012, 12:40 a.m. UTC | #1
On Thu, 2012-08-16 at 19:24 +0000, Hefty, Sean wrote:
> Many FTP servers require fork support.  However, FTP clients,
> such as ncftp, will perform the following call sequence:
> 
> send PASV request to server over connection 1
>          server will listen for connection 2
> issue nonblocking connect to server
> send ACCEPT request to server over connection 1
>          server will accept connection 2
> 
> The current fork support converts all nonblocking connect
> calls to blocking.  The result is that the FTP client ends up
> blocked waiting for the server to accept the connection,
> which it will never do.
> 
> To handle this case, we have the active side follow the same
> rule as the server side and defer establishing the rsocket
> connection until the user calls the first data transfer routine.

I just updated the repository and noticed that we are falling back to
normal sockets even with preload.
After some debugging, found a couple of issuse. See inline.

Thanks
Sridhar

> 
> Signed-off-by: Sean Hefty <sean.hefty@intel.com>
> ---
.....

> @@ -453,37 +486,57 @@ int accept(int socket, struct sockaddr *addr, socklen_t *addrlen)
>   * We can't fork RDMA connections and pass them from the parent to the child
>   * process.  Instead, we need to establish the RDMA connection after calling
>   * fork.  To do this, we delay establishing the RDMA connection until we try
> - * to send/receive on the server side.  On the client side, we don't expect
> - * to fork, so we switch from a TCP connection to an rsocket when connecting.
> + * to send/receive on the server side.
>   */
> -static int fork_active(int socket, const struct sockaddr *addr, socklen_t addrlen)
> +static void fork_active(int socket)
>  {
> -	int fd, ret;
> +	struct sockaddr_storage addr;
> +	int sfd, dfd, ret;
> +	socklen_t len;
>  	uint32_t msg;
>  	long flags;
> 
> -	fd = fd_getd(socket);
> -	flags = real.fcntl(fd, F_GETFL);
> -	real.fcntl(fd, F_SETFL, 0);
> -	ret = real.connect(fd, addr, addrlen);
> +	sfd = fd_getd(socket);
> +
> +	len = sizeof addr;
> +	ret = real.getpeername(sfd, (struct sockaddr *) &addr, &len);
>  	if (ret)
> -		return ret;
> +		goto err1;
> 
> -	ret = real.recv(fd, &msg, sizeof msg, MSG_PEEK);
> -	if ((ret != sizeof msg) || msg) {
> -		fd_store(socket, fd, fd_normal);
> -		return 0;
> -	}
> +	dfd = rsocket(addr.ss_family, SOCK_STREAM, 0);
> +	if (dfd < 0)
> +		goto err1;
> 
> -	real.fcntl(fd, F_SETFL, flags);
> -	ret = transpose_socket(socket, fd_rsocket);
> -	if (ret < 0)
> -		return ret;
> +	flags = real.fcntl(sfd, F_GETFL);
> +	real.fcntl(sfd, F_SETFL, 0);
> +	ret = real.recv(sfd, &msg, sizeof msg, MSG_PEEK);
> +	real.fcntl(sfd, F_SETFL, flags);
> +	if ((ret != sizeof msg) || msg)
> +		goto err2;
> +
> +	ret = rconnect(ret, (struct sockaddr *) &addr, len);

The first parameter to rconnect is incorrect. It should be dfd

> +	if (ret)
> +		goto err2;
> 
> -	real.close(fd);
> -	return rconnect(ret, addr, addrlen);
> +	set_rsocket_options(dfd);
> +	copysockopts(dfd, sfd, &rs, &real);
> +	real.shutdown(sfd, SHUT_RDWR);
> +	real.close(sfd);
> +	fd_store(socket, dfd, fd_rsocket, fd_ready);
> +	return;
> +
> +err2:
> +	rclose(dfd);
> +err1:
> +	fd_store(socket, sfd, fd_normal, fd_ready);
>  }
> 
> +/*
> + * The server will start listening for the new connection, then send a
> + * message to the active side when the listen is ready.  This does leave
> + * fork unsupported in the following case: the server is nonblocking and
> + * calls select/poll waiting to receive data from the client.
> + */
>  static void fork_passive(int socket)
>  {
>  	struct sockaddr_in6 sin6;
> @@ -541,7 +594,7 @@ static void fork_passive(int socket)
>  	copysockopts(dfd, sfd, &rs, &real);
>  	real.shutdown(sfd, SHUT_RDWR);
>  	real.close(sfd);
> -	fd_store(socket, dfd, fd_rsocket);
> +	fd_store(socket, dfd, fd_rsocket, fd_ready);
> 
>  lclose:
>  	rclose(lfd);
> @@ -550,7 +603,7 @@ sclose:
>  	sem_close(sem);
>  out:
>  	if (ret)
> -		fd_store(socket, sfd, fd_normal);
> +		fd_store(socket, sfd, fd_normal, fd_ready);
>  }
> 
>  static inline enum fd_type fd_fork_get(int index, int *fd)
> @@ -559,8 +612,10 @@ static inline enum fd_type fd_fork_get(int index, int *fd)
> 
>  	fdi = idm_lookup(&idm, index);
>  	if (fdi) {
> -		if (fdi->type == fd_fork)
> +		if (fdi->type == fd_fork_passive)

fd_fork is no longer a fd_type. So this should be fdi->state

>  			fork_passive(index);
> +		else if (fdi->type == fd_fork_active)

Same here

> +			fork_active(index);
>  		*fd = fdi->fd;
>  		return fdi->type;
> 
> @@ -574,10 +629,7 @@ int connect(int socket, const struct sockaddr *addr, socklen_t addrlen)
>  {
>  	int fd, ret;
> 
> -	switch (fd_get(socket, &fd)) {
> -	case fd_fork:
> -		return fork_active(socket, addr, addrlen);
> -	case fd_rsocket:
> +	if (fd_get(socket, &fd) == fd_rsocket) {
>  		ret = rconnect(fd, addr, addrlen);
>  		if (!ret || errno == EINPROGRESS)
>  			return ret;
> @@ -588,9 +640,8 @@ int connect(int socket, const struct sockaddr *addr, socklen_t addrlen)
> 
>  		rclose(fd);
>  		fd = ret;
> -		break;
> -	default:
> -		break;
> +	} else if (fd_gets(socket) == fd_fork) {
> +		fd_store(socket, fd, fd_normal, fd_fork_active);
>  	}
> 
>  	return real.connect(fd, addr, addrlen);
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hefty, Sean Aug. 20, 2012, 8:22 p.m. UTC | #2
PiBJIGp1c3QgdXBkYXRlZCB0aGUgcmVwb3NpdG9yeSBhbmQgbm90aWNlZCB0aGF0IHdlIGFyZSBm
YWxsaW5nIGJhY2sgdG8NCj4gbm9ybWFsIHNvY2tldHMgZXZlbiB3aXRoIHByZWxvYWQuDQo+IEFm
dGVyIHNvbWUgZGVidWdnaW5nLCBmb3VuZCBhIGNvdXBsZSBvZiBpc3N1c2UuIFNlZSBpbmxpbmUu
DQoNClRoYW5rcyBmb3IgdGhlIHJlcG9ydC4NCg0KSSd2ZSBwdXNoZWQgYSBwYXRjaCB0byB0aGUg
cmVwb3NpdG9yeSB0byBhZGRyZXNzIHRob3NlIGlzc3VlcywgYWxvbmcgd2l0aCBvbmUgb3RoZXIg
cHJvYmxlbSB0aGF0IEkgd2FzIGFibGUgdG8gaGl0LiAgSWYgdGhlIGNvbm5lY3Rpb24gaXMgbm9u
YmxvY2tpbmcsIHRoZW4gZm9ya19hY3RpdmUoKSBjYW4gYmUgY2FsbGVkIGJlZm9yZSB0aGUgY29u
bmVjdGlvbiBoYXMgY29tcGxldGVkLCB3aGljaCBjYW4gcmVzdWx0IGluIGdldHBlZXJuYW1lKCkg
ZmFpbGluZy4NCg0KSSdtIHN0aWxsIG5vdCBhYmxlIHRvIHJlcHJvZHVjZSB0aGUgaGFuZyB0aGF0
IHlvdSB3ZXJlIHNlZWluZywgYnV0IEkgY2FuIHJlcHJvZHVjZSB0aGUgcHJvYmxlbSB3aGVyZSB0
aGUgdGhyb3VnaHB1dCBzdHJvbmdseSBmYXZvcnMgdGhlIGZpcnN0IHByb2Nlc3MgdG8gY29ubmVj
dCBvdmVyIGEgc2Vjb25kIHByb2Nlc3MuICBJJ20gbG9va2luZyBpbnRvIHRoYXQuDQoNCi0gU2Vh
bg0K
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sridhar Samudrala Aug. 21, 2012, 5:43 p.m. UTC | #3
On Mon, 2012-08-20 at 20:22 +0000, Hefty, Sean wrote:
> > I just updated the repository and noticed that we are falling back to
> > normal sockets even with preload.
> > After some debugging, found a couple of issuse. See inline.
> 
> Thanks for the report.
> 
> I've pushed a patch to the repository to address those issues, along with one other problem that I was able to hit.  If the connection is nonblocking, then fork_active() can be called before the connection has completed, which can result in getpeername() failing.
> 
> I'm still not able to reproduce the hang that you were seeing, but I can reproduce the problem where the throughput strongly favors the first process to connect over a second process.  I'm looking into that.

I am seeing another hang with 2 clients doing bi-directional traffic to a server.
I am consistently seeing this hang with the attached test programs.

You can start the server using
  tcp_server -d -e

and run 2 clients
  tcp_client -h <server-ip> -n 10000 -d -e

Can you try it out with your IB setup? If you don't see this hang, then it is possible
that the issues i am seeing are RoCE specific.

Thanks
Sridhar
Hefty, Sean Aug. 21, 2012, 6 p.m. UTC | #4
> I am seeing another hang with 2 clients doing bi-directional traffic to a

> server.

> I am consistently seeing this hang with the attached test programs.

> 

> You can start the server using

>   tcp_server -d -e

> 

> and run 2 clients

>   tcp_client -h <server-ip> -n 10000 -d -e

> 

> Can you try it out with your IB setup? If you don't see this hang, then it is

> possible

> that the issues i am seeing are RoCE specific.


After looking into netperf more, I think it is hanging for me.  When running multiple clients simultaneously, any client beyond the first after forking stops sending after 300-400 messages.  I haven't checked how much data had transferred, so it may be the same as what you reported.  I'm debugging this.

I don't have tcp_client/server installed, so I'll need to get those.  Does the server call fork?
Sridhar Samudrala Aug. 21, 2012, 6:20 p.m. UTC | #5
On Tue, 2012-08-21 at 18:00 +0000, Hefty, Sean wrote:
> > I am seeing another hang with 2 clients doing bi-directional traffic to a
> > server.
> > I am consistently seeing this hang with the attached test programs.
> > 
> > You can start the server using
> >   tcp_server -d -e
> > 
> > and run 2 clients
> >   tcp_client -h <server-ip> -n 10000 -d -e
> > 
> > Can you try it out with your IB setup? If you don't see this hang, then it is
> > possible
> > that the issues i am seeing are RoCE specific.
> 
> After looking into netperf more, I think it is hanging for me.  When running multiple clients simultaneously, any client beyond the first after forking stops sending after 300-400 messages.  I haven't checked how much data had transferred, so it may be the same as what you reported.  I'm debugging this.
> 
> I don't have tcp_client/server installed, so I'll need to get those.  Does the server call fork?

These are my test programs and i attached them to my previous email.
Yes, the server calls fork.

Thanks
Sridhar



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/src/preload.c b/src/preload.c
index bb8e3fb..8b86415 100644
--- a/src/preload.c
+++ b/src/preload.c
@@ -99,12 +99,20 @@  static int fork_support;
 
 enum fd_type {
 	fd_normal,
-	fd_rsocket,
-	fd_fork
+	fd_rsocket
+};
+
+enum fd_fork_state {
+	fd_ready,
+	fd_fork,
+	fd_fork_listen,
+	fd_fork_active,
+	fd_fork_passive
 };
 
 struct fd_info {
 	enum fd_type type;
+	enum fd_fork_state state;
 	int fd;
 	int dupfd;
 	atomic_t refcnt;
@@ -143,13 +151,14 @@  err1:
 	return ret;
 }
 
-static void fd_store(int index, int fd, enum fd_type type)
+static void fd_store(int index, int fd, enum fd_type type, enum fd_fork_state state)
 {
 	struct fd_info *fdi;
 
 	fdi = idm_at(&idm, index);
 	fdi->fd = fd;
 	fdi->type = type;
+	fdi->state = state;
 }
 
 static inline enum fd_type fd_get(int index, int *fd)
@@ -175,6 +184,14 @@  static inline int fd_getd(int index)
 	return fdi ? fdi->fd : index;
 }
 
+static inline enum fd_fork_state fd_gets(int index)
+{
+	struct fd_info *fdi;
+
+	fdi = idm_lookup(&idm, index);
+	return fdi ? fdi->state : fd_ready;
+}
+
 static inline enum fd_type fd_gett(int index)
 {
 	struct fd_info *fdi;
@@ -353,7 +370,7 @@  static int transpose_socket(int socket, enum fd_type new_type)
 	if (ret)
 		goto err;
 
-	fd_store(socket, dfd, new_type);
+	fd_store(socket, dfd, new_type, fd_ready);
 	return dfd;
 
 err:
@@ -398,9 +415,9 @@  int socket(int domain, int type, int protocol)
 			ret = real.socket(domain, type, protocol);
 			if (ret < 0)
 				return ret;
-			fd_store(index, ret, fd_fork);
+			fd_store(index, ret, fd_normal, fd_fork);
 		} else {
-			fd_store(index, ret, fd_rsocket);
+			fd_store(index, ret, fd_rsocket, fd_ready);
 			set_rsocket_options(ret);
 		}
 		return index;
@@ -419,30 +436,46 @@  int bind(int socket, const struct sockaddr *addr, socklen_t addrlen)
 
 int listen(int socket, int backlog)
 {
-	int fd;
-	return (fd_get(socket, &fd) == fd_rsocket) ?
-		rlisten(fd, backlog) : real.listen(fd, backlog);
+	int fd, ret;
+	if (fd_get(socket, &fd) == fd_rsocket) {
+		ret = rlisten(fd, backlog);
+	} else {
+		ret = real.listen(fd, backlog);
+		if (!ret && fd_gets(socket) == fd_fork)
+			fd_store(socket, fd, fd_normal, fd_fork_listen);
+	}
+	return ret;
 }
 
 int accept(int socket, struct sockaddr *addr, socklen_t *addrlen)
 {
 	int fd, index, ret;
-	enum fd_type type;
 
-	type = fd_get(socket, &fd);
-	if (type == fd_rsocket || type == fd_fork) {
+	if (fd_get(socket, &fd) == fd_rsocket) {
 		index = fd_open();
 		if (index < 0)
 			return index;
 
-		ret = (type == fd_rsocket) ? raccept(fd, addr, addrlen) :
-					     real.accept(fd, addr, addrlen);
+		ret = raccept(fd, addr, addrlen);
 		if (ret < 0) {
 			fd_close(index, &fd);
 			return ret;
 		}
 
-		fd_store(index, ret, type);
+		fd_store(index, ret, fd_rsocket, fd_ready);
+		return index;
+	} else if (fd_gets(socket) == fd_fork_listen) {
+		index = fd_open();
+		if (index < 0)
+			return index;
+
+		ret = real.accept(fd, addr, addrlen);
+		if (ret < 0) {
+			fd_close(index, &fd);
+			return ret;
+		}
+
+		fd_store(index, ret, fd_normal, fd_fork_passive);
 		return index;
 	} else {
 		return real.accept(fd, addr, addrlen);
@@ -453,37 +486,57 @@  int accept(int socket, struct sockaddr *addr, socklen_t *addrlen)
  * We can't fork RDMA connections and pass them from the parent to the child
  * process.  Instead, we need to establish the RDMA connection after calling
  * fork.  To do this, we delay establishing the RDMA connection until we try
- * to send/receive on the server side.  On the client side, we don't expect
- * to fork, so we switch from a TCP connection to an rsocket when connecting.
+ * to send/receive on the server side.
  */
-static int fork_active(int socket, const struct sockaddr *addr, socklen_t addrlen)
+static void fork_active(int socket)
 {
-	int fd, ret;
+	struct sockaddr_storage addr;
+	int sfd, dfd, ret;
+	socklen_t len;
 	uint32_t msg;
 	long flags;
 
-	fd = fd_getd(socket);
-	flags = real.fcntl(fd, F_GETFL);
-	real.fcntl(fd, F_SETFL, 0);
-	ret = real.connect(fd, addr, addrlen);
+	sfd = fd_getd(socket);
+
+	len = sizeof addr;
+	ret = real.getpeername(sfd, (struct sockaddr *) &addr, &len);
 	if (ret)
-		return ret;
+		goto err1;
 
-	ret = real.recv(fd, &msg, sizeof msg, MSG_PEEK);
-	if ((ret != sizeof msg) || msg) {
-		fd_store(socket, fd, fd_normal);
-		return 0;
-	}
+	dfd = rsocket(addr.ss_family, SOCK_STREAM, 0);
+	if (dfd < 0)
+		goto err1;
 
-	real.fcntl(fd, F_SETFL, flags);
-	ret = transpose_socket(socket, fd_rsocket);
-	if (ret < 0)
-		return ret;
+	flags = real.fcntl(sfd, F_GETFL);
+	real.fcntl(sfd, F_SETFL, 0);
+	ret = real.recv(sfd, &msg, sizeof msg, MSG_PEEK);
+	real.fcntl(sfd, F_SETFL, flags);
+	if ((ret != sizeof msg) || msg)
+		goto err2;
+
+	ret = rconnect(ret, (struct sockaddr *) &addr, len);
+	if (ret)
+		goto err2;
 
-	real.close(fd);
-	return rconnect(ret, addr, addrlen);
+	set_rsocket_options(dfd);
+	copysockopts(dfd, sfd, &rs, &real);
+	real.shutdown(sfd, SHUT_RDWR);
+	real.close(sfd);
+	fd_store(socket, dfd, fd_rsocket, fd_ready);
+	return;
+
+err2:
+	rclose(dfd);
+err1:
+	fd_store(socket, sfd, fd_normal, fd_ready);
 }
 
+/*
+ * The server will start listening for the new connection, then send a
+ * message to the active side when the listen is ready.  This does leave
+ * fork unsupported in the following case: the server is nonblocking and
+ * calls select/poll waiting to receive data from the client.
+ */
 static void fork_passive(int socket)
 {
 	struct sockaddr_in6 sin6;
@@ -541,7 +594,7 @@  static void fork_passive(int socket)
 	copysockopts(dfd, sfd, &rs, &real);
 	real.shutdown(sfd, SHUT_RDWR);
 	real.close(sfd);
-	fd_store(socket, dfd, fd_rsocket);
+	fd_store(socket, dfd, fd_rsocket, fd_ready);
 
 lclose:
 	rclose(lfd);
@@ -550,7 +603,7 @@  sclose:
 	sem_close(sem);
 out:
 	if (ret)
-		fd_store(socket, sfd, fd_normal);
+		fd_store(socket, sfd, fd_normal, fd_ready);
 }
 
 static inline enum fd_type fd_fork_get(int index, int *fd)
@@ -559,8 +612,10 @@  static inline enum fd_type fd_fork_get(int index, int *fd)
 
 	fdi = idm_lookup(&idm, index);
 	if (fdi) {
-		if (fdi->type == fd_fork)
+		if (fdi->type == fd_fork_passive)
 			fork_passive(index);
+		else if (fdi->type == fd_fork_active)
+			fork_active(index);
 		*fd = fdi->fd;
 		return fdi->type;
 
@@ -574,10 +629,7 @@  int connect(int socket, const struct sockaddr *addr, socklen_t addrlen)
 {
 	int fd, ret;
 
-	switch (fd_get(socket, &fd)) {
-	case fd_fork:
-		return fork_active(socket, addr, addrlen);
-	case fd_rsocket:
+	if (fd_get(socket, &fd) == fd_rsocket) {
 		ret = rconnect(fd, addr, addrlen);
 		if (!ret || errno == EINPROGRESS)
 			return ret;
@@ -588,9 +640,8 @@  int connect(int socket, const struct sockaddr *addr, socklen_t addrlen)
 
 		rclose(fd);
 		fd = ret;
-		break;
-	default:
-		break;
+	} else if (fd_gets(socket) == fd_fork) {
+		fd_store(socket, fd, fd_normal, fd_fork_active);
 	}
 
 	return real.connect(fd, addr, addrlen);