diff mbox series

[RFC] rdma_rxe: Stop passing AV from user space

Message ID 20201016170147.11016-1-rpearson@hpe.com (mailing list archive)
State RFC
Headers show
Series [RFC] rdma_rxe: Stop passing AV from user space | expand

Commit Message

Bob Pearson Oct. 16, 2020, 5:01 p.m. UTC
This patch shows it is possible to replace the current
method used by rdma_rxe to handle UD transport where
user space creates and then passes an rxe_av *av to the
kernel driver as part of each rxe_send_wqe.

Here user space passes the handle created by the call to
ibv_create_ah in the send wqe. The kernel driver uses that
handle to get a pointer to the 'real' rxe_av that was
created in the kernel by the create ah verb.

To do this requires executing code in the driver that mimics
what is done in rdma_core to convert handles to objects.
This is not ideal but gets the job done. It would probably be
better for rdma_core to provide a service that can do this
for software drivers that have to do this.

The alternative (used by the MW code) is to create a driver
private index and pass that back from the create verb to
user space and then have user space use that in the send wqe.

I would like to avoid replicating work already being done by
rdma core by using the handle that already exists. I don't
know the relative performance of the lookup in rdma_core and
the red black tree used by rxe currently.

There is a matching change to provider/rxe.

Signed-off-by: Bob Pearson <rpearson@hpe.com>
---

Comments

Jason Gunthorpe Oct. 19, 2020, 6:53 p.m. UTC | #1
On Fri, Oct 16, 2020 at 12:01:48PM -0500, Bob Pearson wrote:
>  
> +static struct ib_ah *get_ah_from_handle(struct rxe_qp *qp, u32 handle)
> +{
> +	struct ib_uverbs_file *ufile;
> +	struct uverbs_api *uapi;
> +	const struct uverbs_api_object *type;
> +	struct ib_uobject *uobj;
> +
> +	ufile = qp->ibqp.uobject->uevent.uobject.ufile;
> +	uapi = ufile->device->uapi;
> +	type = uapi_get_object(uapi, UVERBS_OBJECT_AH);
> +	if (IS_ERR(type))
> +		return NULL;
> +	uobj = rdma_lookup_get_uobject(type, ufile, (s64)handle,
> +				       UVERBS_LOOKUP_READ, NULL);
> +	if (IS_ERR(uobj)) {
> +		pr_warn("unable to lookup ah handle\n");
> +		return NULL;
> +	}
> +
> +	rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);

It can't be put and then return the data pointer, it is a use after free:

> +	return uobj->object;

> @@ -562,11 +563,6 @@ static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr,
>  
>  	init_send_wr(qp, &wqe->wr, ibwr);
>  
> -	if (qp_type(qp) == IB_QPT_UD ||
> -	    qp_type(qp) == IB_QPT_SMI ||
> -	    qp_type(qp) == IB_QPT_GSI)
> -		memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av));

It needs some kind of negotiated compat, can't just break userspace
like this

Jason
Bob Pearson Oct. 19, 2020, 7:06 p.m. UTC | #2
On 10/19/20 1:53 PM, Jason Gunthorpe wrote:
> On Fri, Oct 16, 2020 at 12:01:48PM -0500, Bob Pearson wrote:
>>  
>> +static struct ib_ah *get_ah_from_handle(struct rxe_qp *qp, u32 handle)
>> +{
>> +	struct ib_uverbs_file *ufile;
>> +	struct uverbs_api *uapi;
>> +	const struct uverbs_api_object *type;
>> +	struct ib_uobject *uobj;
>> +
>> +	ufile = qp->ibqp.uobject->uevent.uobject.ufile;
>> +	uapi = ufile->device->uapi;
>> +	type = uapi_get_object(uapi, UVERBS_OBJECT_AH);
>> +	if (IS_ERR(type))
>> +		return NULL;
>> +	uobj = rdma_lookup_get_uobject(type, ufile, (s64)handle,
>> +				       UVERBS_LOOKUP_READ, NULL);
>> +	if (IS_ERR(uobj)) {
>> +		pr_warn("unable to lookup ah handle\n");
>> +		return NULL;
>> +	}
>> +
>> +	rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);
> 
> It can't be put and then return the data pointer, it is a use after free:
> 
>> +	return uobj->object;
> 
>> @@ -562,11 +563,6 @@ static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr,
>>  
>>  	init_send_wr(qp, &wqe->wr, ibwr);
>>  
>> -	if (qp_type(qp) == IB_QPT_UD ||
>> -	    qp_type(qp) == IB_QPT_SMI ||
>> -	    qp_type(qp) == IB_QPT_GSI)
>> -		memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av));
> 
> It needs some kind of negotiated compat, can't just break userspace
> like this
> 
> Jason
> 

1st point. I get it. uobj->object contains the address of one of the ib_xxx verbs objects.
Normally the driver never looks at this level but presumably has a kref on that object so it makes
sense to look it up. Perhaps better would be:

	void *object;

	...

	uobj = rdma_lookup_get_uobject(...);

	object = uobj->object;

	rdma_lookup_put_uobject(...);

	return (struct ib_ah *)object;

Here the caller has created the ib_ah but has not yet destroyed it so it must hold a kref on it.


2nd point. I also get. This suggestion imagines that there will come a day when we can change the user API.
May be a rare day but must happen occasionally. The current design is just plain wrong and needs to get fixed
eventually.

Thanks for looking at this.

Bob
Jason Gunthorpe Oct. 19, 2020, 11 p.m. UTC | #3
On Mon, Oct 19, 2020 at 02:06:30PM -0500, Bob Pearson wrote:
> On 10/19/20 1:53 PM, Jason Gunthorpe wrote:
> > On Fri, Oct 16, 2020 at 12:01:48PM -0500, Bob Pearson wrote:
> >>  
> >> +static struct ib_ah *get_ah_from_handle(struct rxe_qp *qp, u32 handle)
> >> +{
> >> +	struct ib_uverbs_file *ufile;
> >> +	struct uverbs_api *uapi;
> >> +	const struct uverbs_api_object *type;
> >> +	struct ib_uobject *uobj;
> >> +
> >> +	ufile = qp->ibqp.uobject->uevent.uobject.ufile;
> >> +	uapi = ufile->device->uapi;
> >> +	type = uapi_get_object(uapi, UVERBS_OBJECT_AH);
> >> +	if (IS_ERR(type))
> >> +		return NULL;
> >> +	uobj = rdma_lookup_get_uobject(type, ufile, (s64)handle,
> >> +				       UVERBS_LOOKUP_READ, NULL);
> >> +	if (IS_ERR(uobj)) {
> >> +		pr_warn("unable to lookup ah handle\n");
> >> +		return NULL;
> >> +	}
> >> +
> >> +	rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);
> > 
> > It can't be put and then return the data pointer, it is a use after free:
> > 
> >> +	return uobj->object;
> > 
> >> @@ -562,11 +563,6 @@ static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr,
> >>  
> >>  	init_send_wr(qp, &wqe->wr, ibwr);
> >>  
> >> -	if (qp_type(qp) == IB_QPT_UD ||
> >> -	    qp_type(qp) == IB_QPT_SMI ||
> >> -	    qp_type(qp) == IB_QPT_GSI)
> >> -		memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av));
> > 
> > It needs some kind of negotiated compat, can't just break userspace
> > like this
> > 
> > Jason
> > 
> 
> 1st point. I get it. uobj->object contains the address of one of the ib_xxx verbs objects.
> Normally the driver never looks at this level but presumably has a kref on that object so it makes
> sense to look it up. Perhaps better would be:
> 
> 	void *object;
> 
> 	...
> 
> 	uobj = rdma_lookup_get_uobject(...);
> 
> 	object = uobj->object;
> 
> 	rdma_lookup_put_uobject(...);
> 
> 	return (struct ib_ah *)object;
> 
> Here the caller has created the ib_ah but has not yet destroyed it so it must hold a kref on it.

Drivers are not supposed to keep using object after it has been
destroyed, so some kind of interlock is needed to prevent/defer
destruction here.

The uobject layer does not provide something usable to drivers

> 2nd point. I also get. This suggestion imagines that there will come
> a day when we can change the user API.  May be a rare day but must
> happen occasionally. The current design is just plain wrong and
> needs to get fixed eventually.

You can have some cap negotiation to switch the mode AH mode in the
WQEs - 'Use WQE format 2' for instance. Most of the HW drivers have
multiple WQE formats the userspace selects.

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index ffe11b03724c..faa70c4f14bb 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -425,6 +425,7 @@  struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
 	uverbs_uobject_put(uobj);
 	return ERR_PTR(ret);
 }
+EXPORT_SYMBOL(rdma_lookup_get_uobject);
 
 static struct ib_uobject *
 alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
@@ -726,6 +727,7 @@  void rdma_lookup_put_uobject(struct ib_uobject *uobj,
 	/* Pairs with the kref obtained by type->lookup_get */
 	uverbs_uobject_put(uobj);
 }
+EXPORT_SYMBOL(rdma_lookup_put_uobject);
 
 void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile)
 {
diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c
index 38021e2c8688..db83e1cde38e 100644
--- a/drivers/infiniband/sw/rxe/rxe_av.c
+++ b/drivers/infiniband/sw/rxe/rxe_av.c
@@ -6,6 +6,8 @@ 
 
 #include "rxe.h"
 #include "rxe_loc.h"
+#include "../../core/uverbs.h"
+#include "../../core/rdma_core.h"
 
 void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av)
 {
@@ -72,13 +74,50 @@  void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr)
 	av->network_type = rdma_gid_attr_network_type(sgid_attr);
 }
 
+static struct ib_ah *get_ah_from_handle(struct rxe_qp *qp, u32 handle)
+{
+	struct ib_uverbs_file *ufile;
+	struct uverbs_api *uapi;
+	const struct uverbs_api_object *type;
+	struct ib_uobject *uobj;
+
+	ufile = qp->ibqp.uobject->uevent.uobject.ufile;
+	uapi = ufile->device->uapi;
+	type = uapi_get_object(uapi, UVERBS_OBJECT_AH);
+	if (IS_ERR(type))
+		return NULL;
+	uobj = rdma_lookup_get_uobject(type, ufile, (s64)handle,
+				       UVERBS_LOOKUP_READ, NULL);
+	if (IS_ERR(uobj)) {
+		pr_warn("unable to lookup ah handle\n");
+		return NULL;
+	}
+
+	rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);
+	return uobj->object;
+}
+
 struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt)
 {
-	if (!pkt || !pkt->qp)
+	struct rxe_qp *qp = pkt->qp;
+	struct rxe_send_wqe *wqe = pkt->wqe;
+	u32 handle;
+	struct ib_ah *ibah;
+	struct rxe_ah *ah;
+
+	if (!pkt || !qp)
 		return NULL;
 
-	if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC)
-		return &pkt->qp->pri_av;
+	if (qp_type(qp) == IB_QPT_RC || qp_type(qp) == IB_QPT_UC)
+		return &qp->pri_av;
+
+	if (qp->is_user) {
+		handle = wqe->wr.wr.ud.ah_handle;
+		ibah = get_ah_from_handle(qp, handle);
+	} else {
+		ibah = wqe->wr.wr.ud.ah;
+	}
 
-	return (pkt->wqe) ? &pkt->wqe->av : NULL;
+	ah = to_rah(ibah);
+	return &ah->av;
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index d4917646641a..2a887dca9a28 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -380,12 +380,16 @@  static struct sk_buff *init_req_packet(struct rxe_qp *qp,
 
 	/* init skb */
 	av = rxe_get_av(pkt);
+	if (!av) {
+		pr_warn("unable to get av\n");
+		return NULL;
+	}
 	skb = rxe_init_packet(rxe, av, paylen, pkt);
 	if (unlikely(!skb))
 		return NULL;
 
 	/* init bth */
-	solicited = (ibwr->send_flags & IB_SEND_SOLICITED) &&
+	solicited = (ibwr->send_flags & IB_SEND_SOLICITED) &&
 			(pkt->mask & RXE_END_MASK) &&
 			((pkt->mask & (RXE_SEND_MASK)) ||
 			(pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) ==
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 0a7d7c55d8d6..4d4ded90f793 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -506,6 +506,7 @@  static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr,
 	if (qp_type(qp) == IB_QPT_UD ||
 	    qp_type(qp) == IB_QPT_SMI ||
 	    qp_type(qp) == IB_QPT_GSI) {
+		wr->wr.ud.ah = ud_wr(ibwr)->ah;
 		wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn;
 		wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey;
 		if (qp_type(qp) == IB_QPT_GSI)
@@ -562,11 +563,6 @@  static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr,
 
 	init_send_wr(qp, &wqe->wr, ibwr);
 
-	if (qp_type(qp) == IB_QPT_UD ||
-	    qp_type(qp) == IB_QPT_SMI ||
-	    qp_type(qp) == IB_QPT_GSI)
-		memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av));
-
 	if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) {
 		p = wqe->dma.inline_data;
 
diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h
index d8f2e0e46dab..d57271451052 100644
--- a/include/uapi/rdma/rdma_user_rxe.h
+++ b/include/uapi/rdma/rdma_user_rxe.h
@@ -89,6 +89,10 @@  struct rxe_send_wr {
 			__u32	reserved;
 		} atomic;
 		struct {
+			union {
+				__aligned_u64 ah_handle;
+				struct ib_ah *ah;
+			};
 			__u32	remote_qpn;
 			__u32	remote_qkey;
 			__u16	pkey_index;
@@ -132,7 +136,6 @@  struct rxe_dma_info {
 
 struct rxe_send_wqe {
 	struct rxe_send_wr	wr;
-	struct rxe_av		av;
 	__u32			status;
 	__u32			state;
 	__aligned_u64		iova;