diff mbox

[v8,14/16] CIFS: SMBD: Upper layer performs SMB read via RDMA write through memory registration

Message ID 20171123003849.17093-15-longli@exchange.microsoft.com (mailing list archive)
State New, archived
Headers show

Commit Message

Long Li Nov. 23, 2017, 12:38 a.m. UTC
From: Long Li <longli@microsoft.com>

If I/O size is larger than rdma_readwrite_threshold, use RDMA write for
SMB read by specifying channel SMB2_CHANNEL_RDMA_V1 or
SMB2_CHANNEL_RDMA_V1_INVALIDATE in the SMB packet, depending on SMB dialect
used. Append a smbd_buffer_descriptor_v1 to the end of the SMB packet and fill
in other values to indicate this SMB read uses RDMA write.

There is no need to read from the transport for incoming payload. At the time
SMB read response comes back, the data is already transferred and placed in the
pages by RDMA hardware.

When SMB read is finished, deregister the memory regions if RDMA write is used
for this SMB read. smbd_deregister_mr may need to do local invalidation and
sleep, if server remote invalidation is not used.

There are situations where the MID may not be created on I/O failure; in that
case the memory region is deregistered when the read data context is released.

Signed-off-by: Long Li <longli@microsoft.com>
---
 fs/cifs/file.c    | 17 +++++++++++++++--
 fs/cifs/smb2pdu.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 59 insertions(+), 3 deletions(-)

Comments

Steve French Jan. 3, 2018, 9:31 p.m. UTC | #1
Fixed minor typo ("transferred") before merging

On Wed, Nov 22, 2017 at 6:38 PM, Long Li <longli@exchange.microsoft.com> wrote:
> From: Long Li <longli@microsoft.com>
>
> If I/O size is larger than rdma_readwrite_threshold, use RDMA write for
> SMB read by specifying channel SMB2_CHANNEL_RDMA_V1 or
> SMB2_CHANNEL_RDMA_V1_INVALIDATE in the SMB packet, depending on SMB dialect
> used. Append a smbd_buffer_descriptor_v1 to the end of the SMB packet and fill
> in other values to indicate this SMB read uses RDMA write.
>
> There is no need to read from the transport for incoming payload. At the time
> SMB read response comes back, the data is already transfered and placed in the
> pages by RDMA hardware.
>
> When SMB read is finished, deregister the memory regions if RDMA write is used
> for this SMB read. smbd_deregister_mr may need to do local invalidation and
> sleep, if server remote invalidation is not used.
>
> There are situations where the MID may not be created on I/O failure, under
> which memory region is deregistered when read data context is released.
>
> Signed-off-by: Long Li <longli@microsoft.com>
> ---
>  fs/cifs/file.c    | 17 +++++++++++++++--
>  fs/cifs/smb2pdu.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 59 insertions(+), 3 deletions(-)
>
> diff --git a/fs/cifs/file.c b/fs/cifs/file.c
> index df9f682..93259a16 100644
> --- a/fs/cifs/file.c
> +++ b/fs/cifs/file.c
> @@ -42,7 +42,7 @@
>  #include "cifs_debug.h"
>  #include "cifs_fs_sb.h"
>  #include "fscache.h"
> -
> +#include "smbdirect.h"
>
>  static inline int cifs_convert_flags(unsigned int flags)
>  {
> @@ -2902,7 +2902,12 @@ cifs_readdata_release(struct kref *refcount)
>  {
>         struct cifs_readdata *rdata = container_of(refcount,
>                                         struct cifs_readdata, refcount);
> -
> +#ifdef CONFIG_CIFS_SMB_DIRECT
> +       if (rdata->mr) {
> +               smbd_deregister_mr(rdata->mr);
> +               rdata->mr = NULL;
> +       }
> +#endif
>         if (rdata->cfile)
>                 cifsFileInfo_put(rdata->cfile);
>
> @@ -3031,6 +3036,10 @@ uncached_fill_pages(struct TCP_Server_Info *server,
>                 }
>                 if (iter)
>                         result = copy_page_from_iter(page, 0, n, iter);
> +#ifdef CONFIG_CIFS_SMB_DIRECT
> +               else if (rdata->mr)
> +                       result = n;
> +#endif
>                 else
>                         result = cifs_read_page_from_socket(server, page, n);
>                 if (result < 0)
> @@ -3600,6 +3609,10 @@ readpages_fill_pages(struct TCP_Server_Info *server,
>
>                 if (iter)
>                         result = copy_page_from_iter(page, 0, n, iter);
> +#ifdef CONFIG_CIFS_SMB_DIRECT
> +               else if (rdata->mr)
> +                       result = n;
> +#endif
>                 else
>                         result = cifs_read_page_from_socket(server, page, n);
>                 if (result < 0)
> diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
> index 908d777..bee0871d 100644
> --- a/fs/cifs/smb2pdu.c
> +++ b/fs/cifs/smb2pdu.c
> @@ -2458,7 +2458,40 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
>         req->MinimumCount = 0;
>         req->Length = cpu_to_le32(io_parms->length);
>         req->Offset = cpu_to_le64(io_parms->offset);
> +#ifdef CONFIG_CIFS_SMB_DIRECT
> +       /*
> +        * If we want to do a RDMA write, fill in and append
> +        * smbd_buffer_descriptor_v1 to the end of read request
> +        */
> +       if (server->rdma && rdata &&
> +               rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) {
> +
> +               struct smbd_buffer_descriptor_v1 *v1;
> +               bool need_invalidate =
> +                       io_parms->tcon->ses->server->dialect == SMB30_PROT_ID;
> +
> +               rdata->mr = smbd_register_mr(
> +                               server->smbd_conn, rdata->pages,
> +                               rdata->nr_pages, rdata->tailsz,
> +                               true, need_invalidate);
> +               if (!rdata->mr)
> +                       return -ENOBUFS;
> +
> +               req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
> +               if (need_invalidate)
> +                       req->Channel = SMB2_CHANNEL_RDMA_V1;
> +               req->ReadChannelInfoOffset =
> +                       offsetof(struct smb2_read_plain_req, Buffer);
> +               req->ReadChannelInfoLength =
> +                       sizeof(struct smbd_buffer_descriptor_v1);
> +               v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
> +               v1->offset = rdata->mr->mr->iova;
> +               v1->token = rdata->mr->mr->rkey;
> +               v1->length = rdata->mr->mr->length;
>
> +               *total_len += sizeof(*v1) - 1;
> +       }
> +#endif
>         if (request_type & CHAINED_REQUEST) {
>                 if (!(request_type & END_OF_CHAIN)) {
>                         /* next 8-byte aligned request */
> @@ -2537,7 +2570,17 @@ smb2_readv_callback(struct mid_q_entry *mid)
>                 if (rdata->result != -ENODATA)
>                         rdata->result = -EIO;
>         }
> -
> +#ifdef CONFIG_CIFS_SMB_DIRECT
> +       /*
> +        * If this rdata has a memmory registered, the MR can be freed
> +        * MR needs to be freed as soon as I/O finishes to prevent deadlock
> +        * because they have limited number and are used for future I/Os
> +        */
> +       if (rdata->mr) {
> +               smbd_deregister_mr(rdata->mr);
> +               rdata->mr = NULL;
> +       }
> +#endif
>         if (rdata->result)
>                 cifs_stats_fail_inc(tcon, SMB2_READ_HE);
>
> --
> 2.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-cifs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index df9f682..93259a16 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -42,7 +42,7 @@ 
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
 #include "fscache.h"
-
+#include "smbdirect.h"
 
 static inline int cifs_convert_flags(unsigned int flags)
 {
@@ -2902,7 +2902,12 @@  cifs_readdata_release(struct kref *refcount)
 {
 	struct cifs_readdata *rdata = container_of(refcount,
 					struct cifs_readdata, refcount);
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+	if (rdata->mr) {
+		smbd_deregister_mr(rdata->mr);
+		rdata->mr = NULL;
+	}
+#endif
 	if (rdata->cfile)
 		cifsFileInfo_put(rdata->cfile);
 
@@ -3031,6 +3036,10 @@  uncached_fill_pages(struct TCP_Server_Info *server,
 		}
 		if (iter)
 			result = copy_page_from_iter(page, 0, n, iter);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+		else if (rdata->mr)
+			result = n;
+#endif
 		else
 			result = cifs_read_page_from_socket(server, page, n);
 		if (result < 0)
@@ -3600,6 +3609,10 @@  readpages_fill_pages(struct TCP_Server_Info *server,
 
 		if (iter)
 			result = copy_page_from_iter(page, 0, n, iter);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+		else if (rdata->mr)
+			result = n;
+#endif
 		else
 			result = cifs_read_page_from_socket(server, page, n);
 		if (result < 0)
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 908d777..bee0871d 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2458,7 +2458,40 @@  smb2_new_read_req(void **buf, unsigned int *total_len,
 	req->MinimumCount = 0;
 	req->Length = cpu_to_le32(io_parms->length);
 	req->Offset = cpu_to_le64(io_parms->offset);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+	/*
+	 * If we want to do a RDMA write, fill in and append
+	 * smbd_buffer_descriptor_v1 to the end of read request
+	 */
+	if (server->rdma && rdata &&
+		rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) {
+
+		struct smbd_buffer_descriptor_v1 *v1;
+		bool need_invalidate =
+			io_parms->tcon->ses->server->dialect == SMB30_PROT_ID;
+
+		rdata->mr = smbd_register_mr(
+				server->smbd_conn, rdata->pages,
+				rdata->nr_pages, rdata->tailsz,
+				true, need_invalidate);
+		if (!rdata->mr)
+			return -ENOBUFS;
+
+		req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
+		if (need_invalidate)
+			req->Channel = SMB2_CHANNEL_RDMA_V1;
+		req->ReadChannelInfoOffset =
+			offsetof(struct smb2_read_plain_req, Buffer);
+		req->ReadChannelInfoLength =
+			sizeof(struct smbd_buffer_descriptor_v1);
+		v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
+		v1->offset = rdata->mr->mr->iova;
+		v1->token = rdata->mr->mr->rkey;
+		v1->length = rdata->mr->mr->length;
 
+		*total_len += sizeof(*v1) - 1;
+	}
+#endif
 	if (request_type & CHAINED_REQUEST) {
 		if (!(request_type & END_OF_CHAIN)) {
 			/* next 8-byte aligned request */
@@ -2537,7 +2570,17 @@  smb2_readv_callback(struct mid_q_entry *mid)
 		if (rdata->result != -ENODATA)
 			rdata->result = -EIO;
 	}
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+	/*
+	 * If this rdata has a memory region registered, the MR can be freed
+	 * MR needs to be freed as soon as I/O finishes to prevent deadlock
+	 * because they have limited number and are used for future I/Os
+	 */
+	if (rdata->mr) {
+		smbd_deregister_mr(rdata->mr);
+		rdata->mr = NULL;
+	}
+#endif
 	if (rdata->result)
 		cifs_stats_fail_inc(tcon, SMB2_READ_HE);