diff mbox

[libibverbs,2/3] Add Memory Windows support

Message ID 1423134637-14438-3-git-send-email-majd@mellanox.com (mailing list archive)
State Rejected
Headers show

Commit Message

Majd Dibbiny Feb. 5, 2015, 11:10 a.m. UTC
From: Majd Dibbiny <majd@mellanox.com>

Memory Windows (MWs) are a method to raise the remote privileges of a memory
range within a Memory Region (MR).

A MW is allocated using ibv_alloc_mw, and for it to be useful it should be
bound to a MR using ibv_bind_mw.

The bind operation generates a new R_key with the new permissions for the MW.
The advantage of MWs is the lightweight generation of R_keys with changing
permissions.

MW type 1's R_key can be invalidated by binding the MW to a MR where the length
of the MW is zero.

A local MW type 2's R_key can be invalidated by sending a work request(WR),
where the immediate data contains the MW's R_key and the opcode is
IBV_WR_LOCAL_INV.

When done, the user can unbind and deallocate the MW using ibv_dealloc_mw.

Add the following verbs:
1. ibv_alloc_mw: Takes a Protection Domain (PD) and a type of MW, and returns
		 a MW.
2. ibv_bind_mw: Takes a Queue Pair(QP), a type 1 MW and bind information (MR,
		address, length, access flags). Then it posts a bind request
		to the given QP. Upon success, the MW's R_key is updated.
		For type 2 MW, one should directly post bind WQE to the QP,
		using ibv_post_send.
3. ibv_dealloc_mw: Unbinds and deallocates the MW.

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
---
 Makefile.am                   |    7 ++-
 include/infiniband/driver.h   |    6 +++
 include/infiniband/kern-abi.h |   23 ++++++++++
 include/infiniband/verbs.h    |   75 ++++++++++++++++++++++++++++++---
 man/ibv_alloc_mw.3            |   49 ++++++++++++++++++++++
 man/ibv_bind_mw.3             |   91 +++++++++++++++++++++++++++++++++++++++++
 man/ibv_inc_rkey.3            |   29 +++++++++++++
 man/ibv_post_send.3           |   22 ++++++++++
 src/cmd.c                     |   37 +++++++++++++++++
 src/libibverbs.map            |    3 +
 10 files changed, 333 insertions(+), 9 deletions(-)
 create mode 100644 man/ibv_alloc_mw.3
 create mode 100644 man/ibv_bind_mw.3
 create mode 100644 man/ibv_inc_rkey.3
diff mbox

Patch

diff --git a/Makefile.am b/Makefile.am
index a6767de..5ae1dab 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -58,7 +58,8 @@  man_MANS = man/ibv_asyncwatch.1 man/ibv_devices.1 man/ibv_devinfo.1	\
     man/ibv_query_srq.3 man/ibv_rate_to_mult.3 man/ibv_reg_mr.3		\
     man/ibv_req_notify_cq.3 man/ibv_resize_cq.3 man/ibv_rate_to_mbps.3  \
     man/ibv_create_qp_ex.3 man/ibv_create_srq_ex.3 man/ibv_open_xrcd.3  \
-    man/ibv_get_srq_num.3 man/ibv_open_qp.3
+    man/ibv_get_srq_num.3 man/ibv_open_qp.3 man/ibv_alloc_mw.3		\
+    man/ibv_bind_mw.3 man/ibv_inc_rkey.3
 
 DEBIAN = debian/changelog debian/compat debian/control debian/copyright \
     debian/ibverbs-utils.install debian/libibverbs1.install \
@@ -94,6 +95,7 @@  install-data-hook:
 	$(RM) ibv_port_state_str.3 && \
 	$(RM) mbps_to_ibv_rate.3 && \
 	$(RM) ibv_close_xrcd.3 && \
+	$(RM) ibv_dealloc_mw.3 && \
 	$(LN_S) ibv_get_async_event.3 ibv_ack_async_event.3 && \
 	$(LN_S) ibv_get_cq_event.3 ibv_ack_cq_events.3 && \
 	$(LN_S) ibv_open_device.3 ibv_close_device.3 && \
@@ -111,4 +113,5 @@  install-data-hook:
 	$(LN_S) ibv_event_type_str.3 ibv_node_type_str.3 && \
 	$(LN_S) ibv_event_type_str.3 ibv_port_state_str.3 && \
 	$(LN_S) ibv_rate_to_mbps.3 mbps_to_ibv_rate.3 && \
-	$(LN_S) ibv_open_xrcd.3 ibv_close_xrcd.3
+	$(LN_S) ibv_open_xrcd.3 ibv_close_xrcd.3 && \
+	$(LN_S) ibv_alloc_mw.3 ibv_dealloc_mw.3
diff --git a/include/infiniband/driver.h b/include/infiniband/driver.h
index 5cc092b..e3b7401 100644
--- a/include/infiniband/driver.h
+++ b/include/infiniband/driver.h
@@ -129,6 +129,12 @@  int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
 		   size_t cmd_size,
 		   struct ibv_reg_mr_resp *resp, size_t resp_size);
 int ibv_cmd_dereg_mr(struct ibv_mr *mr);
+int ibv_cmd_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type,
+		     struct ibv_mw *mw, struct ibv_alloc_mw *cmd,
+		     size_t cmd_size,
+		     struct ibv_alloc_mw_resp *resp, size_t resp_size);
+int ibv_cmd_dealloc_mw(struct ibv_mw *mw,
+		       struct ibv_dealloc_mw *cmd, size_t cmd_size);
 int ibv_cmd_create_cq(struct ibv_context *context, int cqe,
 		      struct ibv_comp_channel *channel,
 		      int comp_vector, struct ibv_cq *cq,
diff --git a/include/infiniband/kern-abi.h b/include/infiniband/kern-abi.h
index 91b45d8..ceb2ca9 100644
--- a/include/infiniband/kern-abi.h
+++ b/include/infiniband/kern-abi.h
@@ -340,6 +340,29 @@  struct ibv_dereg_mr {
 	__u32 mr_handle;
 };
 
+struct ibv_alloc_mw {
+	__u32 command;
+	__u16 in_words;
+	__u16 out_words;
+	__u64 response;
+	__u32 pd_handle;	/* handle of the PD the MW is allocated on */
+	__u8  mw_type;		/* enum ibv_mw_type value */
+	__u8  reserved[3];	/* zeroed by ibv_cmd_alloc_mw() */
+};
+
+struct ibv_alloc_mw_resp {
+	__u32 mw_handle;	/* copied into ibv_mw.handle */
+	__u32 rkey;		/* copied into ibv_mw.rkey */
+};
+
+struct ibv_dealloc_mw {
+	__u32 command;
+	__u16 in_words;
+	__u16 out_words;
+	__u32 mw_handle;	/* ibv_mw.handle of the MW to deallocate */
+	__u32 reserved;		/* set to 0 by ibv_cmd_dealloc_mw() */
+};
+
 struct ibv_create_comp_channel {
 	__u32 command;
 	__u16 in_words;
diff --git a/include/infiniband/verbs.h b/include/infiniband/verbs.h
index cfa1156..dcee050 100644
--- a/include/infiniband/verbs.h
+++ b/include/infiniband/verbs.h
@@ -115,8 +115,11 @@  enum ibv_device_cap_flags {
 	IBV_DEVICE_RC_RNR_NAK_GEN	= 1 << 12,
 	IBV_DEVICE_SRQ_RESIZE		= 1 << 13,
 	IBV_DEVICE_N_NOTIFY_CQ		= 1 << 14,
+	IBV_DEVICE_MEM_WINDOW           = 1 << 17,
 	IBV_DEVICE_XRC			= 1 << 20,
-	IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29
+	IBV_DEVICE_MEM_WINDOW_TYPE_2A   = 1 << 23,
+	IBV_DEVICE_MEM_WINDOW_TYPE_2B   = 1 << 24,
+	IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29,
 };
 
 enum ibv_atomic_cap {
@@ -280,6 +283,7 @@  enum ibv_wc_opcode {
 	IBV_WC_COMP_SWAP,
 	IBV_WC_FETCH_ADD,
 	IBV_WC_BIND_MW,
+	IBV_WC_LOCAL_INV,
 /*
  * Set value of IBV_WC_RECV so consumers can test if a completion is a
  * receive by testing (opcode & IBV_WC_RECV).
@@ -314,7 +318,15 @@  enum ibv_access_flags {
 	IBV_ACCESS_REMOTE_WRITE		= (1<<1),
 	IBV_ACCESS_REMOTE_READ		= (1<<2),
 	IBV_ACCESS_REMOTE_ATOMIC	= (1<<3),
-	IBV_ACCESS_MW_BIND		= (1<<4)
+	IBV_ACCESS_MW_BIND		= (1<<4),
+	IBV_ACCESS_ZERO_BASED		= (1<<5)
+};
+
+struct ibv_mw_bind_info {
+	struct ibv_mr	*mr;		/* MR to bind the MW to */
+	uint64_t	 addr;		/* address the MW should start at */
+	uint64_t	 length;	/* length in bytes the MW should span */
+	uint64_t	 mw_access_flags; /* use ibv_access_flags */
 };
 
 struct ibv_pd {
@@ -364,6 +376,8 @@  struct ibv_mw {
 	struct ibv_context     *context;
 	struct ibv_pd	       *pd;
 	uint32_t		rkey;
+	uint32_t		handle;
+	enum ibv_mw_type	type;
 };
 
 struct ibv_global_route {
@@ -620,7 +634,9 @@  enum ibv_wr_opcode {
 	IBV_WR_SEND_WITH_IMM,
 	IBV_WR_RDMA_READ,
 	IBV_WR_ATOMIC_CMP_AND_SWP,
-	IBV_WR_ATOMIC_FETCH_AND_ADD
+	IBV_WR_ATOMIC_FETCH_AND_ADD,
+	IBV_WR_LOCAL_INV,
+	IBV_WR_BIND_MW
 };
 
 enum ibv_send_flags {
@@ -666,6 +682,11 @@  struct ibv_send_wr {
 			uint32_t    remote_srqn;
 		} xrc;
 	} qp_type;
+	struct {
+		struct ibv_mw		*mw;
+		uint32_t		rkey;
+		struct ibv_mw_bind_info	bind_info;
+	} bind_mw;
 };
 
 struct ibv_recv_wr {
@@ -677,11 +698,8 @@  struct ibv_recv_wr {
 
 struct ibv_mw_bind {
 	uint64_t		wr_id;
-	struct ibv_mr	       *mr;
-	void		       *addr;
-	size_t			length;
 	int			send_flags;
-	int			mw_access_flags;
+	struct ibv_mw_bind_info bind_info;
 };
 
 struct ibv_srq {
@@ -1167,6 +1185,49 @@  int ibv_dereg_mr(struct ibv_mr *mr);
 struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context);
 
 /**
+ * ibv_alloc_mw - Allocate a memory window of the given type on a PD
+ */
+static inline struct ibv_mw *ibv_alloc_mw(struct ibv_pd *pd,
+					  enum ibv_mw_type type)
+{
+	if (!pd->context->ops.alloc_mw) {
+		errno = ENOSYS;	/* context provides no alloc_mw op */
+		return NULL;
+	}
+
+	struct ibv_mw *mw = pd->context->ops.alloc_mw(pd, type);
+
+	return mw;
+}
+
+/**
+ * ibv_dealloc_mw - Unbind and free a memory window
+ */
+static inline int ibv_dealloc_mw(struct ibv_mw *mw)
+{
+	return mw->context->ops.dealloc_mw(mw); /* op is not NULL-checked */
+}
+
+/**
+ * ibv_inc_rkey - Advance the low 8 bits of an rkey by one (0xff wraps to 0)
+ */
+static inline uint32_t ibv_inc_rkey(uint32_t rkey)
+{
+	const uint32_t mask = 0x000000ff;
+	uint8_t newtag = (uint8_t) ((rkey + 1) & mask);
+	return (rkey & ~mask) | newtag;	/* upper 24 bits are kept as-is */
+}
+
+/**
+ * ibv_bind_mw - Post a bind request for a type 1 memory window on a QP
+ */
+static inline int ibv_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
+			      struct ibv_mw_bind *mw_bind)
+{
+	return mw->context->ops.bind_mw(qp, mw, mw_bind); /* type not checked */
+}
+
+/**
  * ibv_destroy_comp_channel - Destroy a completion event channel
  */
 int ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
diff --git a/man/ibv_alloc_mw.3 b/man/ibv_alloc_mw.3
new file mode 100644
index 0000000..5da9d69
--- /dev/null
+++ b/man/ibv_alloc_mw.3
@@ -0,0 +1,49 @@ 
+.\" -*- nroff -*-
+.\"
+.TH IBV_ALLOC_MW 3 2015-01-27 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_alloc_mw, ibv_dealloc_mw \- allocate or deallocate a memory window (MW)
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "struct ibv_mw *ibv_alloc_mw(struct ibv_pd " "*pd" ,
+.BI "                            enum ibv_mw_type " "type");
+.sp
+.BI "int ibv_dealloc_mw(struct ibv_mw " "*mw" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_alloc_mw()
+allocates a memory window (MW) associated with the protection domain
+.I pd\fR.
+The MW's type (1 or 2A/2B) is
+.I type\fR.
+.PP
+The MW is created not bound. For it to be useful, the MW must be bound, through either ibv_bind_mw (type 1) or a special WQE (type 2). Once bound, the memory window allows RDMA (remote) access to a subset of the MR to which it was bound, until invalidated/unbound/deallocated.
+.PP
+.B ibv_dealloc_mw()
+Unbinds and deallocates the MW
+.I mw\fR.
+.SH "RETURN VALUE"
+.B ibv_alloc_mw()
+returns a pointer to the allocated MW, or NULL if the request fails.
+The remote key (\fBR_Key\fR)
+field
+.B rkey
+is used by remote processes to perform Atomic and RDMA operations. This key will be changed during bind operations. The remote process places this
+.B rkey
+as the rkey field of struct ibv_send_wr passed to the ibv_post_send function.
+.PP
+.B ibv_dealloc_mw()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+.B ibv_dereg_mr()
+fails if any memory window is still bound to this MR.
+.SH "SEE ALSO"
+.BR ibv_alloc_pd (3),
+.BR ibv_post_send (3),
+.BR ibv_bind_mw (3),
+.BR ibv_reg_mr (3)
+.SH "AUTHORS"
+.TP
+Majd Dibbiny <majd@mellanox.com>
diff --git a/man/ibv_bind_mw.3 b/man/ibv_bind_mw.3
new file mode 100644
index 0000000..54e7bcb
--- /dev/null
+++ b/man/ibv_bind_mw.3
@@ -0,0 +1,91 @@ 
+.\" -*- nroff -*-
+.\"
+.TH IBV_BIND_MW 3 2015-01-27 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_bind_mw \- post a request to bind a type 1 memory window to a memory region
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "int ibv_bind_mw(struct ibv_qp " "*qp" ", struct ibv_mw " "*mw" ",
+.BI "                struct ibv_mw_bind " "*mw_bind" ");
+.fi
+.SH "DESCRIPTION"
+.B ibv_bind_mw()
+posts to the queue pair
+.I qp
+a request to bind the memory window
+.I mw
+according to the details in
+.I mw_bind\fR.
+.PP
+The argument
+.I mw_bind
+is an ibv_mw_bind struct, as defined in <infiniband/verbs.h>.
+.PP
+.nf
+struct ibv_mw_bind {
+.in +8
+uint64_t                     wr_id;           /* User defined WR ID */
+int                          send_flags;      /* Use ibv_send_flags */
+struct ibv_mw_bind_info      bind_info;       /* MW bind information */
+.in -8
+}
+.fi
+.PP
+.nf
+struct ibv_mw_bind_info {
+.in +8
+struct ibv_mr                *mr;             /* The MR to bind the MW to */
+uint64_t                     addr;            /* The address the MW should start at */
+uint64_t                     length;          /* The length (in bytes) the MW should span */
+uint64_t                     mw_access_flags; /* Access flags to the MW. use ibv_access_flags */
+.in -8
+};
+.fi
+.PP
+The QP Transport Service Type must be either UC or RC for bind operations.
+.PP
+The attribute send_flags describes the properties of the \s-1WR\s0. It is either 0 or the bitwise \s-1OR\s0 of one or more of the following flags:
+.PP
+.TP
+.B IBV_SEND_FENCE \fR Set the fence indicator.  Valid only for QPs with Transport Service Type \fBIBV_QPT_RC
+.TP
+.B IBV_SEND_SIGNALED \fR Set the completion notification indicator.  Relevant only if QP was created with sq_sig_all=0
+.TP
+.B IBV_SEND_SOLICITED \fR Set the solicited event indicator.  Valid only for Send and RDMA Write with immediate
+.PP
+The mw_access_flags define the allowed access to the MW after the bind
+completes successfully. It is either 0 or the bitwise \s-1OR\s0 of one
+or more of the following flags:
+.TP
+.B IBV_ACCESS_REMOTE_WRITE \fR Enable Remote Write Access. Requires local write access to the MR.
+.TP
+.B IBV_ACCESS_REMOTE_READ\fR   Enable Remote Read Access
+.TP
+.B IBV_ACCESS_REMOTE_ATOMIC\fR Enable Remote Atomic Operation Access (if supported). Requires local write access to the MR.
+.TP
+.B IBV_ACCESS_ZERO_BASED\fR If set, the address given in post send is offset from the MW's start address.
+.SH "RETURN VALUE"
+.B ibv_bind_mw()
+returns 0 on success, or the value of errno on failure (which
+indicates the failure reason).  In case of a success, the R_Key of the
+memory window after the bind is returned in the mw->rkey field.
+.SH "NOTES"
+The bind does not complete when the function returns - it is merely
+posted to the QP. The user should keep a copy of the old R_Key, and
+fix the mw structure if the subsequent CQE for the bind operation
+indicates a failure. The user may safely send the R_Key using a send
+request on the same QP, but must not transfer it to the remote in any
+other manner before reading a successful CQE.
+.PP
+Note that for type 2 MW, one should directly post bind WQE to the QP,
+using ibv_post_send.
+.SH "SEE ALSO"
+.BR ibv_alloc_mw (3),
+.BR ibv_post_send (3),
+.BR ibv_poll_cq (3),
+.BR ibv_reg_mr (3)
+.SH "AUTHORS"
+.TP
+Majd Dibbiny <majd@mellanox.com>
diff --git a/man/ibv_inc_rkey.3 b/man/ibv_inc_rkey.3
new file mode 100644
index 0000000..9864179
--- /dev/null
+++ b/man/ibv_inc_rkey.3
@@ -0,0 +1,29 @@ 
+.\" -*- nroff -*-
+.\"
+.TH IBV_INC_RKEY 3 2015-01-29 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+.nf
+ibv_inc_rkey \- creates a new rkey from the given one
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "uint32_t ibv_inc_rkey(uint32_t " "rkey" ");
+.fi
+.SH "DESCRIPTION"
+.B ibv_inc_rkey()
+Increases the 8 LSB of
+.I rkey
+and returns the new value.
+.PP
+.SH "RETURN VALUE"
+.B ibv_inc_rkey()
+returns the new rkey.
+.SH "NOTES"
+.PP
+A use case for this verb can be to create a new rkey from a Memory window's rkey
+when binding it to a Memory region.
+.SH "SEE ALSO"
+.SH "AUTHORS"
+.TP
+Majd Dibbiny <majd@mellanox.com>
diff --git a/man/ibv_post_send.3 b/man/ibv_post_send.3
index 9571575..0d599ad 100644
--- a/man/ibv_post_send.3
+++ b/man/ibv_post_send.3
@@ -69,6 +69,24 @@  uint32_t remote_srqn;            /* Number of the remote SRQ */
 } xrc;
 .in -8
 } qp_type;
+struct {
+.in +8
+struct ibv_mw            *mw;             /* Memory window (MW) of type 2 to bind */
+uint32_t                 rkey;            /* The desired new rkey of the MW */
+struct ibv_mw_bind_info  bind_info;       /* MW additional bind information */
+.in -8
+} bind_mw;
+.in -8
+};
+.fi
+.sp
+.nf
+struct ibv_mw_bind_info {
+.in +8
+struct ibv_mr            *mr;             /* The Memory region (MR) to bind the MW to */
+uint64_t                 addr;            /* The address the MW should start at */
+uint64_t                 length;          /* The length (in bytes) the MW should span */
+uint64_t                 mw_access_flags; /* Access flags to the MW. Use ibv_access_flags */
 .in -8
 };
 .fi
@@ -95,6 +113,8 @@  IBV_WR_RDMA_WRITE_WITH_IMM  |            |     X      |     X
 IBV_WR_RDMA_READ            |            |            |     X
 IBV_WR_ATOMIC_CMP_AND_SWP   |            |            |     X
 IBV_WR_ATOMIC_FETCH_AND_ADD |            |            |     X
+IBV_WR_LOCAL_INV            |            |     X      |     X
+IBV_WR_BIND_MW              |            |     X      |     X
 .fi
 .PP
 The attribute send_flags describes the properties of the \s-1WR\s0. It is either 0 or the bitwise \s-1OR\s0 of one or more of the following flags:
@@ -131,3 +151,5 @@  after the call returns.
 .SH "AUTHORS"
 .TP
 Dotan Barak <dotanba@gmail.com>
+.TP
+Majd Dibbiny <majd@mellanox.com>
diff --git a/src/cmd.c b/src/cmd.c
index 45ea06f..4230d0f 100644
--- a/src/cmd.c
+++ b/src/cmd.c
@@ -280,6 +280,43 @@  int ibv_cmd_dereg_mr(struct ibv_mr *mr)
 	return 0;
 }
 
+int ibv_cmd_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type,
+		     struct ibv_mw *mw, struct ibv_alloc_mw *cmd,
+		     size_t cmd_size,
+		     struct ibv_alloc_mw_resp *resp, size_t resp_size)
+{
+	IBV_INIT_CMD_RESP(cmd, cmd_size, ALLOC_MW, resp, resp_size);
+	cmd->pd_handle	= pd->handle;
+	cmd->mw_type	= type;	/* stored in the one-byte mw_type field */
+	memset(cmd->reserved, 0, sizeof(cmd->reserved));
+
+	if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size)
+		return errno;	/* failed or short write */
+
+	VALGRIND_MAKE_MEM_DEFINED(resp, resp_size);
+
+	mw->context = pd->context;
+	mw->pd      = pd;
+	mw->rkey    = resp->rkey;
+	mw->handle  = resp->mw_handle; /* used by ibv_cmd_dealloc_mw() */
+	mw->type    = type;
+
+	return 0;
+}
+
+int ibv_cmd_dealloc_mw(struct ibv_mw *mw,
+		       struct ibv_dealloc_mw *cmd, size_t cmd_size)
+{
+	IBV_INIT_CMD(cmd, cmd_size, DEALLOC_MW); /* no response buffer */
+	cmd->mw_handle = mw->handle;
+	cmd->reserved = 0;
+
+	if (write(mw->context->cmd_fd, cmd, cmd_size) != cmd_size)
+		return errno;	/* failed or short write */
+
+	return 0;
+}
+
 int ibv_cmd_create_cq(struct ibv_context *context, int cqe,
 		      struct ibv_comp_channel *channel,
 		      int comp_vector, struct ibv_cq *cq,
diff --git a/src/libibverbs.map b/src/libibverbs.map
index 30212f3..bbb2259 100644
--- a/src/libibverbs.map
+++ b/src/libibverbs.map
@@ -100,6 +100,9 @@  IBVERBS_1.1 {
 		ibv_event_type_str;
 		ibv_wc_status_str;
 
+		ibv_cmd_alloc_mw;
+		ibv_cmd_dealloc_mw;
+
 		ibv_rate_to_mbps;
 		mbps_to_ibv_rate;