diff mbox

[[PATCH,v1] 05/37] [CIFS] SMBD: Implement API for upper layer to create SMBD transport and establish RDMA connection

Message ID 1501704648-20159-6-git-send-email-longli@exchange.microsoft.com (mailing list archive)
State New, archived
Headers show

Commit Message

Long Li Aug. 2, 2017, 8:10 p.m. UTC
From: Long Li <longli@microsoft.com>

Implement the code for connecting to SMBD server. The client and server are connected using RC Queue Pair over RDMA API, which suppports Infiniband, RoCE and iWARP. Upper layer code can call cifs_create_rdma_session to establish a SMBD RDMA connection.

Signed-off-by: Long Li <longli@microsoft.com>
---
 fs/cifs/cifsrdma.c | 257 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/cifsrdma.h |  14 +++
 2 files changed, 271 insertions(+)

Comments

Tom Talpey Aug. 14, 2017, 7:54 p.m. UTC | #1
> -----Original Message-----
> From: linux-cifs-owner@vger.kernel.org [mailto:linux-cifs-
> owner@vger.kernel.org] On Behalf Of Long Li
> Sent: Wednesday, August 2, 2017 4:10 PM
> To: Steve French <sfrench@samba.org>; linux-cifs@vger.kernel.org; samba-
> technical@lists.samba.org; linux-kernel@vger.kernel.org
> Cc: Long Li <longli@microsoft.com>
> Subject: [[PATCH v1] 05/37] [CIFS] SMBD: Implement API for upper layer to
> create SMBD transport and establish RDMA connection
> 
> From: Long Li <longli@microsoft.com>
> 
> Implement the code for connecting to SMBD server. The client and server are
> connected using RC Queue Pair over RDMA API, which suppports Infiniband,
> RoCE and iWARP. Upper layer code can call cifs_create_rdma_session to
> establish a SMBD RDMA connection.
> 
> +/* Upcall from RDMA CM */
> +static int cifs_rdma_conn_upcall(
> +               struct rdma_cm_id *id, struct rdma_cm_event *event)
> +{
> +       struct cifs_rdma_info *info = id->context;
> +
> +       log_rdma_event("event=%d status=%d\n", event->event, event->status);
> +
> +       switch (event->event) {
> +       case RDMA_CM_EVENT_ADDR_RESOLVED:
> +       case RDMA_CM_EVENT_ROUTE_RESOLVED:
> +               info->ri_rc = 0;
> +               complete(&info->ri_done);
> +               break;
> +
> +       case RDMA_CM_EVENT_ADDR_ERROR:
> +               info->ri_rc = -EHOSTUNREACH;
> +               complete(&info->ri_done);
> +               break;
> +
> +       case RDMA_CM_EVENT_ROUTE_ERROR:
> +               info->ri_rc = -ENETUNREACH;
> +               complete(&info->ri_done);
> +               break;
> +
> +       case RDMA_CM_EVENT_ESTABLISHED:
> +       case RDMA_CM_EVENT_CONNECT_ERROR:
> +       case RDMA_CM_EVENT_UNREACHABLE:
> +       case RDMA_CM_EVENT_REJECTED:
> +       case RDMA_CM_EVENT_DEVICE_REMOVAL:
> +               log_rdma_event("connected event=%d\n", event->event);
> +               info->connect_state = event->event;
> +               break;
> +
> +       case RDMA_CM_EVENT_DISCONNECTED:
> +               break;
> +
> +       default:
> +               break;
> +       }
> +
> +       return 0;
> +}

This code looks a lot like the connection stuff in the NFS/RDMA RPC transport.
Does your code have the same needs? If so, you might consider moving this to
a common RDMA handler.

> +/* Upcall from RDMA QP */
> +static void
> +cifs_rdma_qp_async_error_upcall(struct ib_event *event, void *context)
> +{
> +       struct cifs_rdma_info *info = context;
> +       log_rdma_event("%s on device %s info %p\n",
> +               ib_event_msg(event->event), event->device->name, info);
> +
> +       switch (event->event)
> +       {
> +       case IB_EVENT_CQ_ERR:
> +       case IB_EVENT_QP_FATAL:
> +       case IB_EVENT_QP_REQ_ERR:
> +       case IB_EVENT_QP_ACCESS_ERR:
> +
> +       default:
> +               break;
> +       }
> +}

Ditto. But, what's up with the empty switch(event->event) processing?

> +static struct rdma_cm_id* cifs_rdma_create_id(
> +               struct cifs_rdma_info *info, struct sockaddr *dstaddr)
> +{
...
> +       log_rdma_event("connecting to IP %pI4 port %d\n",
> +               &addr_in->sin_addr, ntohs(addr_in->sin_port));
>... and then...
> +       if (dstaddr->sa_family == AF_INET6)
> +               sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
> +       else
> +               sport = &((struct sockaddr_in *)dstaddr)->sin_port;
> +
> +       *sport = htons(445);
...and
> +out:
> +       // try port number 5445 if port 445 doesn't work
> +       if (*sport == htons(445)) {
> +               *sport = htons(5445);
> +               goto try_again;
> +       }

Suggest rearranging the log_rdma_event() call to reflect reality.

The IANA-assigned port for SMB Direct is 5445, and port 445 will be
listening on TCP. Should you really be probing that port before 5445?
I suggest not doing so unconditionally.

> +struct cifs_rdma_info* cifs_create_rdma_session(
> +       struct TCP_Server_Info *server, struct sockaddr *dstaddr)
> +{
> ...
> +       int max_pending = receive_credit_max + send_credit_target;
>...
> +       if (max_pending > info->id->device->attrs.max_cqe ||
> +           max_pending > info->id->device->attrs.max_qp_wr) {
> +               log_rdma_event("consider lowering receive_credit_max and "
> +                       "send_credit_target. Possible CQE overrun, device "
> +                       "reporting max_cpe %d max_qp_wr %d\n",
> +                       info->id->device->attrs.max_cqe,
> +                       info->id->device->attrs.max_qp_wr);
> +               goto out2;
> +       }

I don't understand this. Why are you directing both Receive and Send completions
to the same CQ, won't that make it very hard to manage completions and their
interrupts? Also, what device(s) have you seen trigger this log? CQ's are generally
allowed to be quite large.

> +       conn_param.responder_resources = 32;
> +       if (info->id->device->attrs.max_qp_rd_atom < 32)
> +               conn_param.responder_resources =
> +                       info->id->device->attrs.max_qp_rd_atom;
> +       conn_param.retry_count = 6;
> +       conn_param.rnr_retry_count = 6;
> +       conn_param.flow_control = 0;

These choices warrant explanation. 32 responder resources is on the large side for
most datacenter networks. And because of SMB Direct's credits, why is RNR_retry not
simply zero?


--
To unsubscribe from this list: send the line "unsubscribe linux-cifs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Long Li Aug. 30, 2017, 2:35 a.m. UTC | #2
> -----Original Message-----
> From: Tom Talpey
> Sent: Monday, August 14, 2017 12:55 PM
> To: Long Li <longli@microsoft.com>; Steve French <sfrench@samba.org>;
> linux-cifs@vger.kernel.org; samba-technical@lists.samba.org; linux-
> kernel@vger.kernel.org; linux-rdma@vger.kernel.org
> Subject: RE: [[PATCH v1] 05/37] [CIFS] SMBD: Implement API for upper layer
> to create SMBD transport and establish RDMA connection
> 
> > -----Original Message-----
> > From: linux-cifs-owner@vger.kernel.org [mailto:linux-cifs-
> > owner@vger.kernel.org] On Behalf Of Long Li
> > Sent: Wednesday, August 2, 2017 4:10 PM
> > To: Steve French <sfrench@samba.org>; linux-cifs@vger.kernel.org;
> > samba- technical@lists.samba.org; linux-kernel@vger.kernel.org
> > Cc: Long Li <longli@microsoft.com>
> > Subject: [[PATCH v1] 05/37] [CIFS] SMBD: Implement API for upper layer
> > to create SMBD transport and establish RDMA connection
> >
> > From: Long Li <longli@microsoft.com>
> >
> > Implement the code for connecting to SMBD server. The client and
> > server are connected using RC Queue Pair over RDMA API, which
> > suppports Infiniband, RoCE and iWARP. Upper layer code can call
> > cifs_create_rdma_session to establish a SMBD RDMA connection.
> >
> > +/* Upcall from RDMA CM */
> > +static int cifs_rdma_conn_upcall(
> > +               struct rdma_cm_id *id, struct rdma_cm_event *event) {
> > +       struct cifs_rdma_info *info = id->context;
> > +
> > +       log_rdma_event("event=%d status=%d\n", event->event,
> > + event->status);
> > +
> > +       switch (event->event) {
> > +       case RDMA_CM_EVENT_ADDR_RESOLVED:
> > +       case RDMA_CM_EVENT_ROUTE_RESOLVED:
> > +               info->ri_rc = 0;
> > +               complete(&info->ri_done);
> > +               break;
> > +
> > +       case RDMA_CM_EVENT_ADDR_ERROR:
> > +               info->ri_rc = -EHOSTUNREACH;
> > +               complete(&info->ri_done);
> > +               break;
> > +
> > +       case RDMA_CM_EVENT_ROUTE_ERROR:
> > +               info->ri_rc = -ENETUNREACH;
> > +               complete(&info->ri_done);
> > +               break;
> > +
> > +       case RDMA_CM_EVENT_ESTABLISHED:
> > +       case RDMA_CM_EVENT_CONNECT_ERROR:
> > +       case RDMA_CM_EVENT_UNREACHABLE:
> > +       case RDMA_CM_EVENT_REJECTED:
> > +       case RDMA_CM_EVENT_DEVICE_REMOVAL:
> > +               log_rdma_event("connected event=%d\n", event->event);
> > +               info->connect_state = event->event;
> > +               break;
> > +
> > +       case RDMA_CM_EVENT_DISCONNECTED:
> > +               break;
> > +
> > +       default:
> > +               break;
> > +       }
> > +
> > +       return 0;
> > +}
> 
> This code looks a lot like the connection stuff in the NFS/RDMA RPC transport.
> Does your code have the same needs? If so, you might consider moving this
> to a common RDMA handler.

> 
> > +/* Upcall from RDMA QP */
> > +static void
> > +cifs_rdma_qp_async_error_upcall(struct ib_event *event, void
> > +*context) {
> > +       struct cifs_rdma_info *info = context;
> > +       log_rdma_event("%s on device %s info %p\n",
> > +               ib_event_msg(event->event), event->device->name,
> > +info);
> > +
> > +       switch (event->event)
> > +       {
> > +       case IB_EVENT_CQ_ERR:
> > +       case IB_EVENT_QP_FATAL:
> > +       case IB_EVENT_QP_REQ_ERR:
> > +       case IB_EVENT_QP_ACCESS_ERR:
> > +
> > +       default:
> > +               break;
> > +       }
> > +}
> 
> Ditto. But, what's up with the empty switch(event->event) processing?

I have changed to code to disconnect RDMA connection on QP errors.


> 
> > +static struct rdma_cm_id* cifs_rdma_create_id(
> > +               struct cifs_rdma_info *info, struct sockaddr *dstaddr)
> > +{
> ...
> > +       log_rdma_event("connecting to IP %pI4 port %d\n",
> > +               &addr_in->sin_addr, ntohs(addr_in->sin_port));
> >... and then...
> > +       if (dstaddr->sa_family == AF_INET6)
> > +               sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
> > +       else
> > +               sport = &((struct sockaddr_in *)dstaddr)->sin_port;
> > +
> > +       *sport = htons(445);
> ...and
> > +out:
> > +       // try port number 5445 if port 445 doesn't work
> > +       if (*sport == htons(445)) {
> > +               *sport = htons(5445);
> > +               goto try_again;
> > +       }
> 
> Suggest rearranging the log_rdma_event() call to reflect reality.
> 
> The IANA-assigned port for SMB Direct is 5445, and port 445 will be listening
> on TCP. Should you really be probing that port before 5445?
> I suggest not doing so unconditionally.

This part is reworked in V3 to behave as you suggested.

> 
> > +struct cifs_rdma_info* cifs_create_rdma_session(
> > +       struct TCP_Server_Info *server, struct sockaddr *dstaddr) {
> > ...
> > +       int max_pending = receive_credit_max + send_credit_target;
> >...
> > +       if (max_pending > info->id->device->attrs.max_cqe ||
> > +           max_pending > info->id->device->attrs.max_qp_wr) {
> > +               log_rdma_event("consider lowering receive_credit_max and "
> > +                       "send_credit_target. Possible CQE overrun, device "
> > +                       "reporting max_cpe %d max_qp_wr %d\n",
> > +                       info->id->device->attrs.max_cqe,
> > +                       info->id->device->attrs.max_qp_wr);
> > +               goto out2;
> > +       }
> 
> I don't understand this. Why are you directing both Receive and Send
> completions to the same CQ, won't that make it very hard to manage
> completions and their interrupts? Also, what device(s) have you seen trigger
> this log? CQ's are generally allowed to be quite large.

I have moved them to separate completion queues in V3.

> 
> > +       conn_param.responder_resources = 32;
> > +       if (info->id->device->attrs.max_qp_rd_atom < 32)
> > +               conn_param.responder_resources =
> > +                       info->id->device->attrs.max_qp_rd_atom;
> > +       conn_param.retry_count = 6;
> > +       conn_param.rnr_retry_count = 6;
> > +       conn_param.flow_control = 0;
> 
> These choices warrant explanation. 32 responder resources is on the large
> side for most datacenter networks. And because of SMB Direct's credits, why
> is RNR_retry not simply zero?
> 

Those have been fixed. On Mellanox ConnectX3, the attrs.max_qp_rd_atom is 16. So I choose a default value 32 to work with hardware that can potentially support more responder resources.
--
To unsubscribe from this list: send the line "unsubscribe linux-cifs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/cifs/cifsrdma.c b/fs/cifs/cifsrdma.c
index 7c4c178..b18fb79 100644
--- a/fs/cifs/cifsrdma.c
+++ b/fs/cifs/cifsrdma.c
@@ -120,3 +120,260 @@  do {									\
 			atomic_read(&info->send_credits),		\
 			info->send_credit_target);			\
 } while (0)
+
+/* Upcall from RDMA CM */
+static int cifs_rdma_conn_upcall(
+		struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+	struct cifs_rdma_info *info = id->context;
+
+	log_rdma_event("event=%d status=%d\n", event->event, event->status);
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		info->ri_rc = 0;
+		complete(&info->ri_done);
+		break;
+
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		info->ri_rc = -EHOSTUNREACH;
+		complete(&info->ri_done);
+		break;
+
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		info->ri_rc = -ENETUNREACH;
+		complete(&info->ri_done);
+		break;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+	case RDMA_CM_EVENT_UNREACHABLE:
+	case RDMA_CM_EVENT_REJECTED:
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		log_rdma_event("connected event=%d\n", event->event);
+		info->connect_state = event->event;
+		break;
+
+	case RDMA_CM_EVENT_DISCONNECTED:
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/* Upcall from RDMA QP */
+static void
+cifs_rdma_qp_async_error_upcall(struct ib_event *event, void *context)
+{
+	struct cifs_rdma_info *info = context;
+	log_rdma_event("%s on device %s info %p\n",
+		ib_event_msg(event->event), event->device->name, info);
+
+	switch (event->event)
+	{
+	case IB_EVENT_CQ_ERR:
+	case IB_EVENT_QP_FATAL:
+	case IB_EVENT_QP_REQ_ERR:
+	case IB_EVENT_QP_ACCESS_ERR:
+
+	default:
+		break;
+	}
+}
+
+static struct rdma_cm_id* cifs_rdma_create_id(
+		struct cifs_rdma_info *info, struct sockaddr *dstaddr)
+{
+	struct rdma_cm_id *id;
+	int rc;
+	struct sockaddr_in *addr_in = (struct sockaddr_in*) dstaddr;
+	__be16 *sport;
+
+	log_rdma_event("connecting to IP %pI4 port %d\n",
+		&addr_in->sin_addr, ntohs(addr_in->sin_port));
+
+	id = rdma_create_id(&init_net, cifs_rdma_conn_upcall, info,
+		RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(id)) {
+		rc = PTR_ERR(id);
+		log_rdma_event("rdma_create_id() failed %i\n", rc);
+		return id;
+	}
+
+	if (dstaddr->sa_family == AF_INET6)
+		sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
+	else
+		sport = &((struct sockaddr_in *)dstaddr)->sin_port;
+
+	*sport = htons(445);
+try_again:
+	init_completion(&info->ri_done);
+	info->ri_rc = -ETIMEDOUT;
+	rc = rdma_resolve_addr(id, NULL, (struct sockaddr*)dstaddr, 5000);
+	if (rc) {
+		log_rdma_event("rdma_resolve_addr() failed %i\n", rc);
+		goto out;
+	}
+	wait_for_completion_interruptible_timeout(
+		&info->ri_done, msecs_to_jiffies(8000));
+	rc = info->ri_rc;
+	if (rc) {
+		log_rdma_event("rdma_resolve_addr() completed %i\n", rc);
+		goto out;
+	}
+
+	info->ri_rc = -ETIMEDOUT;
+	rc = rdma_resolve_route(id, 5000);
+	if (rc) {
+		log_rdma_event("rdma_resolve_route() failed %i\n", rc);
+		goto out;
+	}
+	wait_for_completion_interruptible_timeout(
+		&info->ri_done, msecs_to_jiffies(8000));
+	rc = info->ri_rc;
+	if (rc) {
+		log_rdma_event("rdma_resolve_route() completed %i\n", rc);
+		goto out;
+	}
+
+	return id;
+
+out:
+	// try port number 5445 if port 445 doesn't work
+	if (*sport == htons(445)) {
+		*sport = htons(5445);
+		goto try_again;
+	}
+	rdma_destroy_id(id);
+	return ERR_PTR(rc);
+}
+
+static int cifs_rdma_ia_open(
+		struct cifs_rdma_info *info, struct sockaddr *dstaddr)
+{
+	int rc;
+
+	info->id = cifs_rdma_create_id(info, dstaddr);
+	if (IS_ERR(info->id)) {
+		rc = PTR_ERR(info->id);
+		goto out1;
+	}
+
+	info->pd = ib_alloc_pd(info->id->device, 0);
+	if (IS_ERR(info->pd)) {
+		rc = PTR_ERR(info->pd);
+		log_rdma_event("ib_alloc_pd() returned %d\n", rc);
+		goto out2;
+	}
+
+	return 0;
+
+out2:
+	rdma_destroy_id(info->id);
+	info->id = NULL;
+
+out1:
+	return rc;
+}
+
+struct cifs_rdma_info* cifs_create_rdma_session(
+	struct TCP_Server_Info *server, struct sockaddr *dstaddr)
+{
+	int rc;
+	struct cifs_rdma_info *info;
+	struct rdma_conn_param conn_param;
+	struct ib_qp_init_attr qp_attr;
+	int max_pending = receive_credit_max + send_credit_target;
+
+	info = kzalloc(sizeof(struct cifs_rdma_info), GFP_KERNEL);
+	if (!info)
+		return NULL;
+
+	info->server_info = server;
+
+	rc = cifs_rdma_ia_open(info, dstaddr);
+	if (rc) {
+		log_rdma_event("cifs_rdma_ia_open rc=%d\n", rc);
+		goto out1;
+	}
+
+	if (max_pending > info->id->device->attrs.max_cqe ||
+	    max_pending > info->id->device->attrs.max_qp_wr) {
+		log_rdma_event("consider lowering receive_credit_max and "
+			"send_credit_target. Possible CQE overrun, device "
+			"reporting max_cpe %d max_qp_wr %d\n",
+			info->id->device->attrs.max_cqe,
+			info->id->device->attrs.max_qp_wr);
+		goto out2;
+	}
+
+	info->receive_credit_max = receive_credit_max;
+	info->send_credit_target = send_credit_target;
+	info->max_send_size = max_send_size;
+	info->max_fragmented_recv_size = max_fragmented_recv_size;
+	info->max_receive_size = max_receive_size;
+
+	max_send_sge = min_t(int, max_send_sge,
+		info->id->device->attrs.max_sge);
+	max_recv_sge = min_t(int, max_recv_sge,
+		info->id->device->attrs.max_sge_rd);
+
+	info->cq = ib_alloc_cq(info->id->device, info,
+			info->receive_credit_max + info->send_credit_target,
+			0, IB_POLL_SOFTIRQ);
+	if (IS_ERR(info->cq))
+		goto out2;
+
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.event_handler = cifs_rdma_qp_async_error_upcall;
+	qp_attr.qp_context = info;
+	qp_attr.cap.max_send_wr = info->send_credit_target;
+	qp_attr.cap.max_recv_wr = info->receive_credit_max;
+	qp_attr.cap.max_send_sge = max_send_sge;
+	qp_attr.cap.max_recv_sge = max_recv_sge;
+	qp_attr.cap.max_inline_data = 0;
+	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	qp_attr.qp_type = IB_QPT_RC;
+	qp_attr.send_cq = info->cq;
+	qp_attr.recv_cq = info->cq;
+	qp_attr.port_num = ~0;
+
+	rc = rdma_create_qp(info->id, info->pd, &qp_attr);
+	if (rc) {
+		log_rdma_event("rdma_create_qp failed %i\n", rc);
+		rc = -ENETUNREACH;
+		goto out2;
+	}
+
+	memset(&conn_param, 0, sizeof(conn_param));
+	conn_param.private_data = NULL;
+	conn_param.private_data_len = 0;
+	conn_param.initiator_depth = 0;
+	conn_param.responder_resources = 32;
+	if (info->id->device->attrs.max_qp_rd_atom < 32)
+		conn_param.responder_resources =
+			info->id->device->attrs.max_qp_rd_atom;
+	conn_param.retry_count = 6;
+	conn_param.rnr_retry_count = 6;
+	conn_param.flow_control = 0;
+	rc = rdma_connect(info->id, &conn_param);
+	if (rc) {
+		log_rdma_event("rdma_connect() failed with %i\n", rc);
+		goto out2;
+	}
+
+	if (info->connect_state != RDMA_CM_EVENT_ESTABLISHED)
+		goto out2;
+
+	log_rdma_event("rdma_connect connected\n");
+out2:
+	rdma_destroy_id(info->id);
+
+out1:
+	kfree(info);
+	return NULL;
+}
diff --git a/fs/cifs/cifsrdma.h b/fs/cifs/cifsrdma.h
index 9979fd4..71ea380 100644
--- a/fs/cifs/cifsrdma.h
+++ b/fs/cifs/cifsrdma.h
@@ -36,6 +36,16 @@ 
 struct cifs_rdma_info {
 	struct TCP_Server_Info *server_info;
 
+	// RDMA related
+	struct rdma_cm_id *id;
+	struct ib_qp_init_attr qp_attr;
+	struct ib_pd *pd;
+	struct ib_cq *cq;
+	struct ib_device_attr dev_attr;
+	int connect_state;
+	int ri_rc;
+	struct completion ri_done;
+
 	//connection paramters
 	int receive_credit_max;
 	int send_credit_target;
@@ -55,4 +65,8 @@  struct cifs_rdma_info {
 	unsigned int count_put_receive_buffer;
 	unsigned int count_send_empty;
 };
+
+// Create a SMBDirect session
+struct cifs_rdma_info* cifs_create_rdma_session(
+	struct TCP_Server_Info *server, struct sockaddr *dstaddr);
 #endif