diff mbox series

[RFC] verbs: Introduce a new reg_mr API for virtual address space

Message ID 20190526080224.2778-1-yuval.shaia@oracle.com (mailing list archive)
State Superseded
Headers show
Series [RFC] verbs: Introduce a new reg_mr API for virtual address space | expand

Commit Message

Yuval Shaia May 26, 2019, 8:02 a.m. UTC
The virtual address that is registered is used as a base for any address
used later in post_recv and post_send operations.

On a virtualised environment this is not correct.

A guest cannot register its memory so hypervisor maps the guest physical
address to a host virtual address and register it with the HW. Later on,
at datapath phase, the guest fills the SGEs with addresses from its
address space.
Since HW cannot access guest virtual address space an extra translation
is needed map those addresses to be based on the host virtual address
that was registered with the HW.

To avoid this, a logical separation between the address that is
registered and the address that is used as a offset at datapath phase is
needed.
This separation is already implemented in the lower layer part
(ibv_cmd_reg_mr) but blocked at the API level.

Fix it by introducing a new API function that accepts a address from
guest virtual address space as well.

Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
---
 libibverbs/driver.h    |  3 +++
 libibverbs/dummy_ops.c | 10 ++++++++++
 libibverbs/verbs.h     | 26 ++++++++++++++++++++++++++
 providers/rxe/rxe.c    | 16 ++++++++++++----
 4 files changed, 51 insertions(+), 4 deletions(-)

Comments

Jason Gunthorpe May 27, 2019, 12:11 p.m. UTC | #1
On Sun, May 26, 2019 at 11:02:24AM +0300, Yuval Shaia wrote:
> The virtual address that is registered is used as a base for any address
> used later in post_recv and post_send operations.
> 
> On a virtualised environment this is not correct.
> 
> A guest cannot register its memory so hypervisor maps the guest physical
> address to a host virtual address and register it with the HW. Later on,
> at datapath phase, the guest fills the SGEs with addresses from its
> address space.
> Since HW cannot access guest virtual address space an extra translation
> is needed map those addresses to be based on the host virtual address
> that was registered with the HW.
> 
> To avoid this, a logical separation between the address that is
> registered and the address that is used as a offset at datapath phase is
> needed.
> This separation is already implemented in the lower layer part
> (ibv_cmd_reg_mr) but blocked at the API level.
> 
> Fix it by introducing a new API function that accepts a address from
> guest virtual address space as well.
> 
> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
>  libibverbs/driver.h    |  3 +++
>  libibverbs/dummy_ops.c | 10 ++++++++++
>  libibverbs/verbs.h     | 26 ++++++++++++++++++++++++++
>  providers/rxe/rxe.c    | 16 ++++++++++++----
>  4 files changed, 51 insertions(+), 4 deletions(-)
> 
> diff --git a/libibverbs/driver.h b/libibverbs/driver.h
> index e4d624b2..73bc10e6 100644
> +++ b/libibverbs/driver.h
> @@ -339,6 +339,9 @@ struct verbs_context_ops {
>  				    unsigned int access);
>  	struct ibv_mr *(*reg_mr)(struct ibv_pd *pd, void *addr, size_t length,
>  				 int access);
> +	struct ibv_mr *(*reg_mr_virt_as)(struct ibv_pd *pd, void *addr,
> +					 size_t length, uint64_t hca_va,
> +					 int access);

I don't want to see a new entry point, all HW already supports it, so
we should just add the hca_va to the main one and remove the
assumption that the void *addr should be used as the hca_va from the
drivers.


>  	int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only);
>  	int (*rereg_mr)(struct verbs_mr *vmr, int flags, struct ibv_pd *pd,
>  			void *addr, size_t length, int access);
> diff --git a/libibverbs/dummy_ops.c b/libibverbs/dummy_ops.c
> index c861c3a0..aab61a17 100644
> +++ b/libibverbs/dummy_ops.c
> @@ -416,6 +416,14 @@ static struct ibv_mr *reg_mr(struct ibv_pd *pd, void *addr, size_t length,
>  	return NULL;
>  }
>  
> +static struct ibv_mr *reg_mr_virt_as(struct ibv_pd *pd, void *addr,
> +				     size_t length, uint64_t hca_va,
> +				     int access)
> +{
> +	errno = ENOSYS;

> +	return NULL;
> +}
> +
>  static int req_notify_cq(struct ibv_cq *cq, int solicited_only)
>  {
>  	return ENOSYS;
> @@ -508,6 +516,7 @@ const struct verbs_context_ops verbs_dummy_ops = {
>  	read_counters,
>  	reg_dm_mr,
>  	reg_mr,
> +	reg_mr_virt_as,
>  	req_notify_cq,
>  	rereg_mr,
>  	resize_cq,
> @@ -623,6 +632,7 @@ void verbs_set_ops(struct verbs_context *vctx,
>  	SET_PRIV_OP(ctx, query_srq);
>  	SET_OP(vctx, reg_dm_mr);
>  	SET_PRIV_OP(ctx, reg_mr);
> +	SET_OP(vctx, reg_mr_virt_as);
>  	SET_OP(ctx, req_notify_cq);
>  	SET_PRIV_OP(ctx, rereg_mr);
>  	SET_PRIV_OP(ctx, resize_cq);
> diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h
> index cb2d8439..8bcc8388 100644
> +++ b/libibverbs/verbs.h
> @@ -2037,6 +2037,9 @@ struct verbs_context {
>  	struct ibv_mr *(*reg_dm_mr)(struct ibv_pd *pd, struct ibv_dm *dm,
>  				    uint64_t dm_offset, size_t length,
>  				    unsigned int access);
> +	struct ibv_mr *(*reg_mr_virt_as)(struct ibv_pd *pd, void *addr,
> +					 size_t length, uint64_t hca_va,
> +					 int access);

Can't add new functions here, breaks the ABI

Jason
Yuval Shaia May 27, 2019, 12:18 p.m. UTC | #2
On Mon, May 27, 2019 at 09:11:34AM -0300, Jason Gunthorpe wrote:
> On Sun, May 26, 2019 at 11:02:24AM +0300, Yuval Shaia wrote:
> > The virtual address that is registered is used as a base for any address
> > used later in post_recv and post_send operations.
> > 
> > On a virtualised environment this is not correct.
> > 
> > A guest cannot register its memory so hypervisor maps the guest physical
> > address to a host virtual address and register it with the HW. Later on,
> > at datapath phase, the guest fills the SGEs with addresses from its
> > address space.
> > Since HW cannot access guest virtual address space an extra translation
> > is needed map those addresses to be based on the host virtual address
> > that was registered with the HW.
> > 
> > To avoid this, a logical separation between the address that is
> > registered and the address that is used as a offset at datapath phase is
> > needed.
> > This separation is already implemented in the lower layer part
> > (ibv_cmd_reg_mr) but blocked at the API level.
> > 
> > Fix it by introducing a new API function that accepts a address from
> > guest virtual address space as well.
> > 
> > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> >  libibverbs/driver.h    |  3 +++
> >  libibverbs/dummy_ops.c | 10 ++++++++++
> >  libibverbs/verbs.h     | 26 ++++++++++++++++++++++++++
> >  providers/rxe/rxe.c    | 16 ++++++++++++----
> >  4 files changed, 51 insertions(+), 4 deletions(-)
> > 
> > diff --git a/libibverbs/driver.h b/libibverbs/driver.h
> > index e4d624b2..73bc10e6 100644
> > +++ b/libibverbs/driver.h
> > @@ -339,6 +339,9 @@ struct verbs_context_ops {
> >  				    unsigned int access);
> >  	struct ibv_mr *(*reg_mr)(struct ibv_pd *pd, void *addr, size_t length,
> >  				 int access);
> > +	struct ibv_mr *(*reg_mr_virt_as)(struct ibv_pd *pd, void *addr,
> > +					 size_t length, uint64_t hca_va,
> > +					 int access);
> 
> I don't want to see a new entry point, all HW already supports it, so
> we should just add the hca_va to the main one and remove the
> assumption that the void *addr should be used as the hca_va from the
> drivers.

So it is better to change the reg_mr signature? That would break API, i.e.
all apps that are using reg_mr would have to be changed accordingly.
I'm a newbie here so i might be talking nonsense.

> 
> 
> >  	int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only);
> >  	int (*rereg_mr)(struct verbs_mr *vmr, int flags, struct ibv_pd *pd,
> >  			void *addr, size_t length, int access);
> > diff --git a/libibverbs/dummy_ops.c b/libibverbs/dummy_ops.c
> > index c861c3a0..aab61a17 100644
> > +++ b/libibverbs/dummy_ops.c
> > @@ -416,6 +416,14 @@ static struct ibv_mr *reg_mr(struct ibv_pd *pd, void *addr, size_t length,
> >  	return NULL;
> >  }
> >  
> > +static struct ibv_mr *reg_mr_virt_as(struct ibv_pd *pd, void *addr,
> > +				     size_t length, uint64_t hca_va,
> > +				     int access)
> > +{
> > +	errno = ENOSYS;
> 
> > +	return NULL;
> > +}
> > +
> >  static int req_notify_cq(struct ibv_cq *cq, int solicited_only)
> >  {
> >  	return ENOSYS;
> > @@ -508,6 +516,7 @@ const struct verbs_context_ops verbs_dummy_ops = {
> >  	read_counters,
> >  	reg_dm_mr,
> >  	reg_mr,
> > +	reg_mr_virt_as,
> >  	req_notify_cq,
> >  	rereg_mr,
> >  	resize_cq,
> > @@ -623,6 +632,7 @@ void verbs_set_ops(struct verbs_context *vctx,
> >  	SET_PRIV_OP(ctx, query_srq);
> >  	SET_OP(vctx, reg_dm_mr);
> >  	SET_PRIV_OP(ctx, reg_mr);
> > +	SET_OP(vctx, reg_mr_virt_as);
> >  	SET_OP(ctx, req_notify_cq);
> >  	SET_PRIV_OP(ctx, rereg_mr);
> >  	SET_PRIV_OP(ctx, resize_cq);
> > diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h
> > index cb2d8439..8bcc8388 100644
> > +++ b/libibverbs/verbs.h
> > @@ -2037,6 +2037,9 @@ struct verbs_context {
> >  	struct ibv_mr *(*reg_dm_mr)(struct ibv_pd *pd, struct ibv_dm *dm,
> >  				    uint64_t dm_offset, size_t length,
> >  				    unsigned int access);
> > +	struct ibv_mr *(*reg_mr_virt_as)(struct ibv_pd *pd, void *addr,
> > +					 size_t length, uint64_t hca_va,
> > +					 int access);
> 
> Can't add new functions here, breaks the ABI

My assumption was that it is better to add new function than to change an
existing function's signature.

> 
> Jason
Jason Gunthorpe May 27, 2019, 12:26 p.m. UTC | #3
On Mon, May 27, 2019 at 03:18:39PM +0300, Yuval Shaia wrote:
> On Mon, May 27, 2019 at 09:11:34AM -0300, Jason Gunthorpe wrote:
> > On Sun, May 26, 2019 at 11:02:24AM +0300, Yuval Shaia wrote:
> > > The virtual address that is registered is used as a base for any address
> > > used later in post_recv and post_send operations.
> > > 
> > > On a virtualised environment this is not correct.
> > > 
> > > A guest cannot register its memory so hypervisor maps the guest physical
> > > address to a host virtual address and register it with the HW. Later on,
> > > at datapath phase, the guest fills the SGEs with addresses from its
> > > address space.
> > > Since HW cannot access guest virtual address space an extra translation
> > > is needed map those addresses to be based on the host virtual address
> > > that was registered with the HW.
> > > 
> > > To avoid this, a logical separation between the address that is
> > > registered and the address that is used as a offset at datapath phase is
> > > needed.
> > > This separation is already implemented in the lower layer part
> > > (ibv_cmd_reg_mr) but blocked at the API level.
> > > 
> > > Fix it by introducing a new API function that accepts a address from
> > > guest virtual address space as well.
> > > 
> > > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> > >  libibverbs/driver.h    |  3 +++
> > >  libibverbs/dummy_ops.c | 10 ++++++++++
> > >  libibverbs/verbs.h     | 26 ++++++++++++++++++++++++++
> > >  providers/rxe/rxe.c    | 16 ++++++++++++----
> > >  4 files changed, 51 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/libibverbs/driver.h b/libibverbs/driver.h
> > > index e4d624b2..73bc10e6 100644
> > > +++ b/libibverbs/driver.h
> > > @@ -339,6 +339,9 @@ struct verbs_context_ops {
> > >  				    unsigned int access);
> > >  	struct ibv_mr *(*reg_mr)(struct ibv_pd *pd, void *addr, size_t length,
> > >  				 int access);
> > > +	struct ibv_mr *(*reg_mr_virt_as)(struct ibv_pd *pd, void *addr,
> > > +					 size_t length, uint64_t hca_va,
> > > +					 int access);
> > 
> > I don't want to see a new entry point, all HW already supports it, so
> > we should just add the hca_va to the main one and remove the
> > assumption that the void *addr should be used as the hca_va from the
> > drivers.
> 
> So it is better to change the reg_mr signature? That would break API, i.e.
> all apps that are using reg_mr would have to be changed accordingly.
> I'm a newbie here so i might be talking nonsense.

driver.h is not ABI

> > > diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h
> > > index cb2d8439..8bcc8388 100644
> > > +++ b/libibverbs/verbs.h
> > > @@ -2037,6 +2037,9 @@ struct verbs_context {
> > >  	struct ibv_mr *(*reg_dm_mr)(struct ibv_pd *pd, struct ibv_dm *dm,
> > >  				    uint64_t dm_offset, size_t length,
> > >  				    unsigned int access);
> > > +	struct ibv_mr *(*reg_mr_virt_as)(struct ibv_pd *pd, void *addr,
> > > +					 size_t length, uint64_t hca_va,
> > > +					 int access);
> > 
> > Can't add new functions here, breaks the ABI
> 
> My assumption was that it is better to add new function than to change an
> existing function's signature.

verbs.h is ABI, so you have to change it by growing the structs, not
adding things in the middle.

Probably the best thing for an API like this is to just add a new
normally linked public function instead of using function pointers
here.

Jason
diff mbox series

Patch

diff --git a/libibverbs/driver.h b/libibverbs/driver.h
index e4d624b2..73bc10e6 100644
--- a/libibverbs/driver.h
+++ b/libibverbs/driver.h
@@ -339,6 +339,9 @@  struct verbs_context_ops {
 				    unsigned int access);
 	struct ibv_mr *(*reg_mr)(struct ibv_pd *pd, void *addr, size_t length,
 				 int access);
+	struct ibv_mr *(*reg_mr_virt_as)(struct ibv_pd *pd, void *addr,
+					 size_t length, uint64_t hca_va,
+					 int access);
 	int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only);
 	int (*rereg_mr)(struct verbs_mr *vmr, int flags, struct ibv_pd *pd,
 			void *addr, size_t length, int access);
diff --git a/libibverbs/dummy_ops.c b/libibverbs/dummy_ops.c
index c861c3a0..aab61a17 100644
--- a/libibverbs/dummy_ops.c
+++ b/libibverbs/dummy_ops.c
@@ -416,6 +416,14 @@  static struct ibv_mr *reg_mr(struct ibv_pd *pd, void *addr, size_t length,
 	return NULL;
 }
 
+static struct ibv_mr *reg_mr_virt_as(struct ibv_pd *pd, void *addr,
+				     size_t length, uint64_t hca_va,
+				     int access)
+{
+	errno = ENOSYS;
+	return NULL;
+}
+
 static int req_notify_cq(struct ibv_cq *cq, int solicited_only)
 {
 	return ENOSYS;
@@ -508,6 +516,7 @@  const struct verbs_context_ops verbs_dummy_ops = {
 	read_counters,
 	reg_dm_mr,
 	reg_mr,
+	reg_mr_virt_as,
 	req_notify_cq,
 	rereg_mr,
 	resize_cq,
@@ -623,6 +632,7 @@  void verbs_set_ops(struct verbs_context *vctx,
 	SET_PRIV_OP(ctx, query_srq);
 	SET_OP(vctx, reg_dm_mr);
 	SET_PRIV_OP(ctx, reg_mr);
+	SET_OP(vctx, reg_mr_virt_as);
 	SET_OP(ctx, req_notify_cq);
 	SET_PRIV_OP(ctx, rereg_mr);
 	SET_PRIV_OP(ctx, resize_cq);
diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h
index cb2d8439..8bcc8388 100644
--- a/libibverbs/verbs.h
+++ b/libibverbs/verbs.h
@@ -2037,6 +2037,9 @@  struct verbs_context {
 	struct ibv_mr *(*reg_dm_mr)(struct ibv_pd *pd, struct ibv_dm *dm,
 				    uint64_t dm_offset, size_t length,
 				    unsigned int access);
+	struct ibv_mr *(*reg_mr_virt_as)(struct ibv_pd *pd, void *addr,
+					 size_t length, uint64_t hca_va,
+					 int access);
 	struct ibv_dm *(*alloc_dm)(struct ibv_context *context,
 				   struct ibv_alloc_dm_attr *attr);
 	int (*free_dm)(struct ibv_dm *dm);
@@ -2574,6 +2577,29 @@  struct ibv_mr *ibv_reg_dm_mr(struct ibv_pd *pd, struct ibv_dm *dm,
 	return vctx->reg_dm_mr(pd, dm, dm_offset, length, access);
 }
 
+/**
+ * ibv_reg_mr_virt_as - Register MR for virtual address
+ * @pd - The PD to associated this MR with
+ * @addr - Address to register
+ * @length - Number of bytes to register
+ * @hca_va - Virtual Address
+ * @access - memory region access flags
+ */
+static inline
+struct ibv_mr *ibv_reg_mr_virt_as(struct ibv_pd *pd, void *addr, size_t length,
+				  uint64_t hca_va, int access)
+{
+	struct verbs_context *vctx = verbs_get_ctx_op(pd->context,
+						      reg_mr_virt_as);
+
+	if (!vctx) {
+		errno = ENOSYS;
+		return NULL;
+	}
+
+	return vctx->reg_mr_virt_as(pd, addr, length, hca_va, access);
+}
+
 /**
  * ibv_create_cq - Create a completion queue
  * @context - Context CQ will be attached to
diff --git a/providers/rxe/rxe.c b/providers/rxe/rxe.c
index 909c3f7b..e7246da9 100644
--- a/providers/rxe/rxe.c
+++ b/providers/rxe/rxe.c
@@ -122,8 +122,9 @@  static int rxe_dealloc_pd(struct ibv_pd *pd)
 	return ret;
 }
 
-static struct ibv_mr *rxe_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
-				 int access)
+static struct ibv_mr *rxe_reg_mr_virt_as(struct ibv_pd *pd, void *addr,
+					 size_t length, uint64_t hca_va,
+					 int access)
 {
 	struct verbs_mr *vmr;
 	struct ibv_reg_mr cmd;
@@ -134,8 +135,8 @@  static struct ibv_mr *rxe_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
 	if (!vmr)
 		return NULL;
 
-	ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t)addr, access, vmr,
-			     &cmd, sizeof cmd, &resp, sizeof resp);
+	ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, &cmd,
+			     sizeof(cmd), &resp, sizeof(resp));
 	if (ret) {
 		free(vmr);
 		return NULL;
@@ -144,6 +145,12 @@  static struct ibv_mr *rxe_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
 	return &vmr->ibv_mr;
 }
 
+static struct ibv_mr *rxe_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
+				 int access)
+{
+	return rxe_reg_mr_virt_as(pd, addr, length, (uint64_t) addr, access);
+}
+
 static int rxe_dereg_mr(struct verbs_mr *vmr)
 {
 	int ret;
@@ -836,6 +843,7 @@  static const struct verbs_context_ops rxe_ctx_ops = {
 	.alloc_pd = rxe_alloc_pd,
 	.dealloc_pd = rxe_dealloc_pd,
 	.reg_mr = rxe_reg_mr,
+	.reg_mr_virt_as = rxe_reg_mr_virt_as,
 	.dereg_mr = rxe_dereg_mr,
 	.create_cq = rxe_create_cq,
 	.poll_cq = rxe_poll_cq,