
[for-next] RDMA/rxe: Guard mr state with spin lock

Message ID 20220725200114.2666-1-rpearsonhpe@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Jason Gunthorpe
Series [for-next] RDMA/rxe: Guard mr state with spin lock

Commit Message

Bob Pearson July 25, 2022, 8:01 p.m. UTC
Currently the rxe driver does not guard changes to the mr state
against races between local operations and remote invalidate
operations. This patch adds a spinlock to the mr object and makes
the state changes atomic.

Applies to current for-next after adding the patch
Revert "RDMA/rxe: Create duplicate mapping tables for FMRs"

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_mr.c    | 62 +++++++++++++++++++--------
 drivers/infiniband/sw/rxe/rxe_resp.c  | 24 +++++++++--
 drivers/infiniband/sw/rxe/rxe_verbs.h |  3 ++
 3 files changed, 67 insertions(+), 22 deletions(-)

Comments

Jason Gunthorpe July 28, 2022, 4:42 p.m. UTC | #1
On Mon, Jul 25, 2022 at 03:01:15PM -0500, Bob Pearson wrote:
> Currently the rxe driver does not guard changes to the mr state
> against races between local operations and remote invalidate
> operations. This patch adds a spinlock to the mr object and makes
> the state changes atomic.

This doesn't make it atomic..

> +	state = smp_load_acquire(&mr->state);
> +
>  	if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
>  		     (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
>  		     mr_pd(mr) != pd || (access && !(access & mr->access)) ||
> -		     mr->state != RXE_MR_STATE_VALID)) {
> +		     state != RXE_MR_STATE_VALID)) {
>  		rxe_put(mr);

This is still just differently racy

The whole point of invalidate is to say that when the invalidate
completion occurs there is absolutely no touching of the memory that
MR points to.

I don't see how this achieves that. You need a proper lock
spanning from the lookup here until all the "dma" is completed.

Jason
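
A minimal sketch of the kind of lock span Jason is describing, built on
the spinlock this patch adds; the entry point and the rxe_mr_copy_data()
helper are placeholders, not the driver's actual copy path:

static int rxe_use_mr_locked(struct rxe_pd *pd, u32 rkey, u64 iova,
			     void *buf, int len)
{
	struct rxe_mr *mr;
	int err;

	mr = lookup_mr(pd, IB_ACCESS_REMOTE_WRITE, rkey, RXE_LOOKUP_REMOTE);
	if (!mr)
		return -EINVAL;

	/* hold the lock from the state check until the "dma" is done,
	 * so a concurrent invalidate cannot complete mid-copy
	 */
	spin_lock_bh(&mr->lock);
	if (mr->state != RXE_MR_STATE_VALID) {
		spin_unlock_bh(&mr->lock);
		rxe_put(mr);
		return -EINVAL;
	}
	err = rxe_mr_copy_data(mr, iova, buf, len);	/* placeholder */
	spin_unlock_bh(&mr->lock);

	rxe_put(mr);
	return err;
}

The invalidate path would then take the same mr->lock before moving the
state to FREE, so the invalidate completion cannot be generated while a
copy is still in flight.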
Bob Pearson July 28, 2022, 5:54 p.m. UTC | #2
On 7/28/22 11:42, Jason Gunthorpe wrote:
> On Mon, Jul 25, 2022 at 03:01:15PM -0500, Bob Pearson wrote:
>> Currently the rxe driver does not guard changes to the mr state
>> against races between local operations and remote invalidate
>> operations. This patch adds a spinlock to the mr object and makes
>> the state changes atomic.
> 
> This doesn't make it atomic..
> 
>> +	state = smp_load_acquire(&mr->state);
>> +
>>  	if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
>>  		     (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
>>  		     mr_pd(mr) != pd || (access && !(access & mr->access)) ||
>> -		     mr->state != RXE_MR_STATE_VALID)) {
>> +		     state != RXE_MR_STATE_VALID)) {
>>  		rxe_put(mr);
> 
> This is still just differently racy
> 
> The whole point of invalidate is to say that when the invalidate
> completion occurs there is absolutely no touching of the memory that
> MR points to.
> 
> I don't see how this achieves that. You need a proper lock
> spanning from the lookup here until all the "dma" is completed.
> 
> Jason

Interesting. Then things are in a bit of a mess. Before this patch of course there
was nothing. And, rxe_resp.c currently looks up an mr from the rkey and saves it
in the qp and then uses it for additional packets as required for e.g. rdma write
operations. A local invalidate before a multipacket write finishes will have the wrong
effect. It will continue to use the mr to perform the data copies. And the data copy
routine does not validate the mr state. We would have to save the rkey instead and
re-lookup the mr for each packet.

For a single packet we complete the dma in a single tasklet call. We would have a choice
of holding a spinlock (for a fairly long time) or marking the mr as busy and deferring a
local invalidate. A remote invalidate would fall between the packets of an rdma op.

Bob
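
A sketch of the "save the rkey and re-look up the mr for each packet"
idea Bob mentions, reusing rxe_recheck_mr() from the patch below;
write_data_one() and the exact resp fields used here are assumptions:

static enum resp_states write_data_one(struct rxe_qp *qp,
				       struct rxe_pkt_info *pkt)
{
	struct rxe_mr *mr;
	int err;

	/* re-resolve the saved rkey on every packet instead of caching
	 * the mr across the whole rdma write
	 */
	mr = rxe_recheck_mr(qp, qp->resp.rkey);
	if (!mr)
		return RESPST_ERR_RKEY_VIOLATION;

	err = rxe_mr_copy(mr, qp->resp.va + qp->resp.offset,
			  payload_addr(pkt), payload_size(pkt),
			  RXE_TO_MR_OBJ);
	rxe_put(mr);

	return err ? RESPST_ERR_RKEY_VIOLATION : RESPST_NONE;
}

This narrows the window for multipacket writes, but each packet still
races with an invalidate arriving between the recheck and the copy
unless the copy itself is guarded.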
Jason Gunthorpe July 29, 2022, 5:43 p.m. UTC | #3
On Thu, Jul 28, 2022 at 12:54:34PM -0500, Bob Pearson wrote:
> On 7/28/22 11:42, Jason Gunthorpe wrote:
> > On Mon, Jul 25, 2022 at 03:01:15PM -0500, Bob Pearson wrote:
> >> Currently the rxe driver does not guard changes to the mr state
> >> against races between local operations and remote invalidate
> >> operations. This patch adds a spinlock to the mr object and makes
> >> the state changes atomic.
> > 
> > This doesn't make it atomic..
> > 
> >> +	state = smp_load_acquire(&mr->state);
> >> +
> >>  	if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
> >>  		     (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
> >>  		     mr_pd(mr) != pd || (access && !(access & mr->access)) ||
> >> -		     mr->state != RXE_MR_STATE_VALID)) {
> >> +		     state != RXE_MR_STATE_VALID)) {
> >>  		rxe_put(mr);
> > 
> > This is still just differently racy
> > 
> > The whole point of invalidate is to say that when the invalidate
> > completion occurs there is absolutely no touching of the memory that
> > MR points to.
> > 
> > I don't see how this achieves that. You need a proper lock
> > spanning from the lookup here until all the "dma" is completed.
> > 
> > Jason
> 
> Interesting. Then things are in a bit of a mess. Before this patch of course there
> was nothing. And, rxe_resp.c currently looks up an mr from the rkey and saves it
> in the qp and then uses it for additional packets as required for e.g. rdma write
> operations. A local invalidate before a multipacket write finishes will have the wrong
> effect. It will continue to use the mr to perform the data copies. And the data copy
> routine does not validate the mr state. We would have to save the rkey instead and
> re-lookup the mr for each packet.
> 
> For a single packet we complete the dma in a single tasklet call. We would have a choice
> of holding a spinlock (for a fairly long time) or marking the mr as busy and deferring a
> local invalidate. A remote invalidate would fall between the packets of an rdma op.

Some kind of "lock" that delays the invalidate completion until the MR
is finished being used is also fairly sensible.

In this case you'd probably convert the 'valid' into some kind of
refcount, and when the refcount drops to 0 you'd complete any
pending invalidation WR.

So on the tasklet side it becomes two atomics - which is about the
same overhead as a spinlock.

Jason
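
A rough sketch of that refcount scheme, assuming a hypothetical usecnt
field (held at 1 while the MR is registered) and a hypothetical helper
that posts the deferred invalidate completion:

static bool rxe_mr_start_use(struct rxe_mr *mr)
{
	/* fails once an invalidate has dropped the last reference */
	return refcount_inc_not_zero(&mr->usecnt);
}

static void rxe_mr_end_use(struct rxe_mr *mr)
{
	/* last user out completes any deferred invalidate WR */
	if (refcount_dec_and_test(&mr->usecnt))
		rxe_mr_complete_invalidate(mr);	/* hypothetical */
}

static void rxe_mr_invalidate(struct rxe_mr *mr)
{
	/* drop the registration reference; the completion is posted
	 * here if the MR is idle, otherwise by the last rxe_mr_end_use()
	 */
	rxe_mr_end_use(mr);
}

On the tasklet side each use is exactly the inc_not_zero/dec_and_test
pair, i.e. the two atomics Jason mentions.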
Bob Pearson July 29, 2022, 5:50 p.m. UTC | #4
On 7/28/22 12:54, Bob Pearson wrote:
> On 7/28/22 11:42, Jason Gunthorpe wrote:
>> On Mon, Jul 25, 2022 at 03:01:15PM -0500, Bob Pearson wrote:
>>> Currently the rxe driver does not guard changes to the mr state
>>> against races between local operations and remote invalidate
>>> operations. This patch adds a spinlock to the mr object and makes
>>> the state changes atomic.
>>
>> This doesn't make it atomic..
>>
>>> +	state = smp_load_acquire(&mr->state);
>>> +
>>>  	if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
>>>  		     (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
>>>  		     mr_pd(mr) != pd || (access && !(access & mr->access)) ||
>>> -		     mr->state != RXE_MR_STATE_VALID)) {
>>> +		     state != RXE_MR_STATE_VALID)) {
>>>  		rxe_put(mr);
>>
>> This is still just differently racy
>>
>> The whole point of invalidate is to say that when the invalidate
>> completion occurs there is absolutely no touching of the memory that
>> MR points to.
>>
>> I don't see how this achieves that. You need a proper lock
>> spanning from the lookup here until all the "dma" is completed.
>>
>> Jason
> 
> Interesting. Then things are in a bit of a mess. Before this patch of course there
> was nothing. And, rxe_resp.c currently looks up an mr from the rkey and saves it
> in the qp and then uses it for additional packets as required for e.g. rdma write
> operations. A local invalidate before a multipacket write finishes will have the wrong
> effect. It will continue to use the mr to perform the data copies. And the data copy
> routine does not validate the mr state. We would have to save the rkey instead and
> re-lookup the mr for each packet.
> 
> For a single packet we complete the dma in a single tasklet call. We would have a choice
> of holding a spinlock (for a fairly long time) or marking the mr as busy and deferring a
> local invalidate. A remote invalidate would fall between the packets of an rdma op.
> 
> Bob

Just rechecked all this and it isn't bad. The rxe responder only saves the mr during a single
packet-processing cycle. Still not locked against races, but at least no major surgery is needed
for rdma writes.

Bob

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 850b80f5ad8b..1dd849eb14dd 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -61,7 +61,8 @@  static void rxe_mr_init(int access, struct rxe_mr *mr)
 	mr->lkey = mr->ibmr.lkey = lkey;
 	mr->rkey = mr->ibmr.rkey = rkey;
 
-	mr->state = RXE_MR_STATE_INVALID;
+	spin_lock_init(&mr->lock);
+
 	mr->map_shift = ilog2(RXE_BUF_PER_MAP);
 }
 
@@ -109,7 +110,7 @@  void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
 
 	mr->ibmr.pd = &pd->ibpd;
 	mr->access = access;
-	mr->state = RXE_MR_STATE_VALID;
+	smp_store_release(&mr->state, RXE_MR_STATE_VALID);
 	mr->type = IB_MR_TYPE_DMA;
 }
 
@@ -182,7 +183,7 @@  int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
 	mr->iova = iova;
 	mr->va = start;
 	mr->offset = ib_umem_offset(umem);
-	mr->state = RXE_MR_STATE_VALID;
+	smp_store_release(&mr->state, RXE_MR_STATE_VALID);
 	mr->type = IB_MR_TYPE_USER;
 
 	return 0;
@@ -210,7 +211,7 @@  int rxe_mr_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mr *mr)
 
 	mr->ibmr.pd = &pd->ibpd;
 	mr->max_buf = max_pages;
-	mr->state = RXE_MR_STATE_FREE;
+	smp_store_release(&mr->state, RXE_MR_STATE_FREE);
 	mr->type = IB_MR_TYPE_MEM_REG;
 
 	return 0;
@@ -260,8 +261,11 @@  void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length)
 	size_t offset;
 	int m, n;
 	void *addr;
+	enum rxe_mr_state state;
+
+	state = smp_load_acquire(&mr->state);
 
-	if (mr->state != RXE_MR_STATE_VALID) {
+	if (state != RXE_MR_STATE_VALID) {
 		pr_warn("mr not in valid state\n");
 		addr = NULL;
 		goto out;
@@ -510,15 +514,18 @@  struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
 	struct rxe_mr *mr;
 	struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
 	int index = key >> 8;
+	enum rxe_mr_state state;
 
 	mr = rxe_pool_get_index(&rxe->mr_pool, index);
 	if (!mr)
 		return NULL;
 
+	state = smp_load_acquire(&mr->state);
+
 	if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
 		     (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
 		     mr_pd(mr) != pd || (access && !(access & mr->access)) ||
-		     mr->state != RXE_MR_STATE_VALID)) {
+		     state != RXE_MR_STATE_VALID)) {
 		rxe_put(mr);
 		mr = NULL;
 	}
@@ -559,7 +566,18 @@  int rxe_invalidate_mr(struct rxe_qp *qp, u32 key)
 		goto err_drop_ref;
 	}
 
-	mr->state = RXE_MR_STATE_FREE;
+	spin_lock_bh(&mr->lock);
+	if (mr->state == RXE_MR_STATE_INVALID) {
+		spin_unlock_bh(&mr->lock);
+		pr_debug("%s: Attempt to invalidate mr#%d in INVALID state\n",
+			__func__, mr->elem.index);
+		ret = -EINVAL;
+		goto err_drop_ref;
+	} else {
+		mr->state = RXE_MR_STATE_FREE;
+	}
+	spin_unlock_bh(&mr->lock);
+
 	ret = 0;
 
 err_drop_ref:
@@ -581,13 +599,6 @@  int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
 	u32 key = wqe->wr.wr.reg.key;
 	u32 access = wqe->wr.wr.reg.access;
 
-	/* user can only register MR in free state */
-	if (unlikely(mr->state != RXE_MR_STATE_FREE)) {
-		pr_warn("%s: mr->lkey = 0x%x not free\n",
-			__func__, mr->lkey);
-		return -EINVAL;
-	}
-
 	/* user can only register mr with qp in same protection domain */
 	if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) {
 		pr_warn("%s: qp->pd and mr->pd don't match\n",
@@ -602,11 +613,20 @@  int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
 		return -EINVAL;
 	}
 
-	mr->access = access;
-	mr->lkey = key;
-	mr->rkey = (access & IB_ACCESS_REMOTE) ? key : 0;
-	mr->iova = wqe->wr.wr.reg.mr->iova;
-	mr->state = RXE_MR_STATE_VALID;
+	spin_lock_bh(&mr->lock);
+	if (mr->state == RXE_MR_STATE_FREE) {
+		mr->access = access;
+		mr->lkey = key;
+		mr->rkey = (access & IB_ACCESS_REMOTE) ? key : 0;
+		mr->iova = wqe->wr.wr.reg.mr->iova;
+		mr->state = RXE_MR_STATE_VALID;
+	} else {
+		spin_unlock_bh(&mr->lock);
+		pr_debug("%s: mr#%d not in FREE state\n",
+			__func__, mr->elem.index);
+		return -EINVAL;
+	}
+	spin_unlock_bh(&mr->lock);
 
 	return 0;
 }
@@ -619,6 +639,10 @@  int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	if (atomic_read(&mr->num_mw) > 0)
 		return -EINVAL;
 
+	spin_lock_bh(&mr->lock);
+	mr->state = RXE_MR_STATE_INVALID;
+	spin_unlock_bh(&mr->lock);
+
 	rxe_cleanup(mr);
 
 	return 0;
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index b36ec5c4d5e0..5ff0cdce24d2 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -602,15 +602,20 @@  static enum resp_states atomic_reply(struct rxe_qp *qp,
 	enum resp_states ret;
 	struct rxe_mr *mr = qp->resp.mr;
 	struct resp_res *res = qp->resp.res;
+	enum rxe_mr_state state;
 	u64 value;
 
+	spin_lock_bh(&mr->lock);
+	state = mr->state;
+	spin_unlock_bh(&mr->lock);
+
 	if (!res) {
 		res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_MASK);
 		qp->resp.res = res;
 	}
 
 	if (!res->replay) {
-		if (mr->state != RXE_MR_STATE_VALID) {
+		if (state != RXE_MR_STATE_VALID) {
 			ret = RESPST_ERR_RKEY_VIOLATION;
 			goto out;
 		}
@@ -723,6 +728,7 @@  static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey)
 	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 	struct rxe_mr *mr;
 	struct rxe_mw *mw;
+	enum rxe_mr_state state;
 
 	if (rkey_is_mw(rkey)) {
 		mw = rxe_pool_get_index(&rxe->mw_pool, rkey >> 8);
@@ -730,8 +736,16 @@  static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey)
 			return NULL;
 
 		mr = mw->mr;
+		if (mr) {
+			spin_lock_bh(&mr->lock);
+			state = mr->state;
+			spin_unlock_bh(&mr->lock);
+		} else {
+			state = RXE_MR_STATE_INVALID;
+		}
+
 		if (mw->rkey != rkey || mw->state != RXE_MW_STATE_VALID ||
-		    !mr || mr->state != RXE_MR_STATE_VALID) {
+		    !mr || state != RXE_MR_STATE_VALID) {
 			rxe_put(mw);
 			return NULL;
 		}
@@ -746,7 +760,11 @@  static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey)
 	if (!mr)
 		return NULL;
 
-	if (mr->rkey != rkey || mr->state != RXE_MR_STATE_VALID) {
+	spin_lock_bh(&mr->lock);
+	state = mr->state;
+	spin_unlock_bh(&mr->lock);
+
+	if (mr->rkey != rkey || state != RXE_MR_STATE_VALID) {
 		rxe_put(mr);
 		return NULL;
 	}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index a24fbe984066..2dbc754c3faf 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -302,7 +302,10 @@  struct rxe_mr {
 
 	u32			lkey;
 	u32			rkey;
+
 	enum rxe_mr_state	state;
+	spinlock_t		lock;	/* guard state */
+
 	enum ib_mr_type		type;
 	u64			va;
 	u64			iova;