diff mbox series

[for-next,2/2] RDMA/rxe: Enable rcu locking of indexed objects

Message ID 20230718175943.16734-3-rpearsonhpe@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Jason Gunthorpe
Headers show
Series Enable rcu locking of verbs objects | expand

Commit Message

Bob Pearson July 18, 2023, 5:59 p.m. UTC
Make rcu_read locking of critical sections with the indexed
verbs objects be protected from early freeing of those objects.
The AH, QP, MR and MW objects are looked up from their indices
contained in received packets or WQEs during I/O processing.
Make these objects be freed using kfree_rcu to avoid races.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_pool.h  | 1 +
 drivers/infiniband/sw/rxe/rxe_verbs.c | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

Comments

Zhu Yanjun July 19, 2023, 5:38 a.m. UTC | #1
On Wed, Jul 19, 2023 at 2:00 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>
> Make rcu_read locking of critical sections with the indexed
> verbs objects be protected from early freeing of those objects.
> The AH, QP, MR and MW objects are looked up from their indices
> contained in received packets or WQEs during I/O processing.
> Make these objects be freed using kfree_rcu to avoid races.
>
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> ---
>  drivers/infiniband/sw/rxe/rxe_pool.h  | 1 +
>  drivers/infiniband/sw/rxe/rxe_verbs.c | 6 +++++-
>  2 files changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
> index b42e26427a70..ef2f6f88e254 100644
> --- a/drivers/infiniband/sw/rxe/rxe_pool.h
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.h
> @@ -25,6 +25,7 @@ struct rxe_pool_elem {
>         struct kref             ref_cnt;
>         struct list_head        list;
>         struct completion       complete;
> +       struct rcu_head         rcu;
>         u32                     index;
>  };
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index 903f0b71447e..b899924b81eb 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -1395,7 +1395,7 @@ static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
>         if (cleanup_err)
>                 rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err);
>
> -       kfree_rcu_mightsleep(mr);
> +       kfree_rcu(mr, elem.rcu);
>         return 0;
>
>  err_out:
> @@ -1494,6 +1494,10 @@ static const struct ib_device_ops rxe_dev_ops = {
>         INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq),
>         INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
>         INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw),
> +
> +       .rcu_offset_ah = offsetof(struct rxe_ah, elem.rcu),
> +       .rcu_offset_qp = offsetof(struct rxe_qp, elem.rcu),
> +       .rcu_offset_mw = offsetof(struct rxe_mw, elem.rcu),

In your commits, you mentioned that ah, qp, mw and mr will be
protected. But here, only ah, qp and mw are set.
Not sure if mr is also protected and set in other place.
Except that, I am fine with it.

Acked-by: Zhu Yanjun <zyjzyj2000@gmail.com>

Zhu Yanjun

>  };
>
>  int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
> --
> 2.39.2
>
Leon Romanovsky July 19, 2023, 7:49 a.m. UTC | #2
On Tue, Jul 18, 2023 at 12:59:44PM -0500, Bob Pearson wrote:
> Make rcu_read locking of critical sections with the indexed
> verbs objects be protected from early freeing of those objects.
> The AH, QP, MR and MW objects are looked up from their indices
> contained in received packets or WQEs during I/O processing.
> Make these objects be freed using kfree_rcu to avoid races.

Sorry, how use of RCU avoid races?

Thanks
Bob Pearson July 19, 2023, 2:34 p.m. UTC | #3
On 7/19/23 00:38, Zhu Yanjun wrote:
> On Wed, Jul 19, 2023 at 2:00 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>>
>> Make rcu_read locking of critical sections with the indexed
>> verbs objects be protected from early freeing of those objects.
>> The AH, QP, MR and MW objects are looked up from their indices
>> contained in received packets or WQEs during I/O processing.
>> Make these objects be freed using kfree_rcu to avoid races.
>>
>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>> ---
>>  drivers/infiniband/sw/rxe/rxe_pool.h  | 1 +
>>  drivers/infiniband/sw/rxe/rxe_verbs.c | 6 +++++-
>>  2 files changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
>> index b42e26427a70..ef2f6f88e254 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_pool.h
>> +++ b/drivers/infiniband/sw/rxe/rxe_pool.h
>> @@ -25,6 +25,7 @@ struct rxe_pool_elem {
>>         struct kref             ref_cnt;
>>         struct list_head        list;
>>         struct completion       complete;
>> +       struct rcu_head         rcu;
>>         u32                     index;
>>  };
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
>> index 903f0b71447e..b899924b81eb 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
>> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
>> @@ -1395,7 +1395,7 @@ static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
>>         if (cleanup_err)
>>                 rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err);
>>
>> -       kfree_rcu_mightsleep(mr);
>> +       kfree_rcu(mr, elem.rcu);
>>         return 0;
>>
>>  err_out:
>> @@ -1494,6 +1494,10 @@ static const struct ib_device_ops rxe_dev_ops = {
>>         INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq),
>>         INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
>>         INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw),
>> +
>> +       .rcu_offset_ah = offsetof(struct rxe_ah, elem.rcu),
>> +       .rcu_offset_qp = offsetof(struct rxe_qp, elem.rcu),
>> +       .rcu_offset_mw = offsetof(struct rxe_mw, elem.rcu),
> 
> In your commits, you mentioned that ah, qp, mw and mr will be
> protected. But here, only ah, qp and mw are set.
> Not sure if mr is also protected and set in other place.
> Except that, I am fine with it.
> 
> Acked-by: Zhu Yanjun <zyjzyj2000@gmail.com>
> 
> Zhu Yanjun
> 
>>  };
>>
>>  int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
>> --
>> 2.39.2
>>

All of the verbs objects except MR are allocated and freed by rdma-core. MR is allocated
and freed by the drivers. So for MR the kfree_rcu call is made in rxe_dereg_mr().
I changed it from kfree_rcu_mightsleep to the kfree_rcu variant to match the three others.

Bob
Bob Pearson July 19, 2023, 4:43 p.m. UTC | #4
On 7/19/23 02:49, Leon Romanovsky wrote:
> On Tue, Jul 18, 2023 at 12:59:44PM -0500, Bob Pearson wrote:
>> Make rcu_read locking of critical sections with the indexed
>> verbs objects be protected from early freeing of those objects.
>> The AH, QP, MR and MW objects are looked up from their indices
>> contained in received packets or WQEs during I/O processing.
>> Make these objects be freed using kfree_rcu to avoid races.
> 
> Sorry, how use of RCU avoid races?
> 
> Thanks

The races are between destroy/deallocate/dereg verbs API calls and packets arriving or completing send
or deferred processing of wqes. Packets and wqes contain indices/keys/numbers that refer to objects.
The rxe driver maintains xarrays for each type of object that allow to lookup the address of the object
from its index and then take a reference to protect the pointer. The destroy verbs defer completion
until the reference count falls to zero and then removes the entry in the xarray. These operations
need to be atomic. One alternative is to use spinlocks to protect them but that places a load on
performance under heavy load which is typically dominated by the lookup function since objects tend
to have a long lifetime. rcu readlocks are a better alternative but depend on the deferred destruction
of the objects used in the rcu critical section. In certain benchmarks the use of rcu locking has
a very large performance advantage over spinlocks. For example 100s of QPs running over a 200 gbit/s
network.

Bob
Leon Romanovsky July 23, 2023, 3 p.m. UTC | #5
On Wed, Jul 19, 2023 at 11:43:30AM -0500, Bob Pearson wrote:
> On 7/19/23 02:49, Leon Romanovsky wrote:
> > On Tue, Jul 18, 2023 at 12:59:44PM -0500, Bob Pearson wrote:
> >> Make rcu_read locking of critical sections with the indexed
> >> verbs objects be protected from early freeing of those objects.
> >> The AH, QP, MR and MW objects are looked up from their indices
> >> contained in received packets or WQEs during I/O processing.
> >> Make these objects be freed using kfree_rcu to avoid races.
> > 
> > Sorry, how use of RCU avoid races?
> > 
> > Thanks
> 
> The races are between destroy/deallocate/dereg verbs API calls and packets arriving or completing send
> or deferred processing of wqes. Packets and wqes contain indices/keys/numbers that refer to objects.
> The rxe driver maintains xarrays for each type of object that allow to lookup the address of the object
> from its index and then take a reference to protect the pointer. The destroy verbs defer completion
> until the reference count falls to zero and then removes the entry in the xarray. These operations
> need to be atomic. One alternative is to use spinlocks to protect them but that places a load on
> performance under heavy load which is typically dominated by the lookup function since objects tend
> to have a long lifetime. rcu readlocks are a better alternative but depend on the deferred destruction
> of the objects used in the rcu critical section. 

You rarely can replace locks with RCU without careful design changes.

Thanks
Jason Gunthorpe Aug. 9, 2023, 7:17 p.m. UTC | #6
On Tue, Jul 18, 2023 at 12:59:44PM -0500, Bob Pearson wrote:
> Make rcu_read locking of critical sections with the indexed
> verbs objects be protected from early freeing of those objects.
> The AH, QP, MR and MW objects are looked up from their indices
> contained in received packets or WQEs during I/O processing.
> Make these objects be freed using kfree_rcu to avoid races.
> 
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> ---
>  drivers/infiniband/sw/rxe/rxe_pool.h  | 1 +
>  drivers/infiniband/sw/rxe/rxe_verbs.c | 6 +++++-
>  2 files changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
> index b42e26427a70..ef2f6f88e254 100644
> --- a/drivers/infiniband/sw/rxe/rxe_pool.h
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.h
> @@ -25,6 +25,7 @@ struct rxe_pool_elem {
>  	struct kref		ref_cnt;
>  	struct list_head	list;
>  	struct completion	complete;
> +	struct rcu_head		rcu;
>  	u32			index;
>  };
>  
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index 903f0b71447e..b899924b81eb 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -1395,7 +1395,7 @@ static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
>  	if (cleanup_err)
>  		rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err);
>  
> -	kfree_rcu_mightsleep(mr);
> +	kfree_rcu(mr, elem.rcu);
>  	return 0;
>  
>  err_out:
> @@ -1494,6 +1494,10 @@ static const struct ib_device_ops rxe_dev_ops = {
>  	INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq),
>  	INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
>  	INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw),
> +
> +	.rcu_offset_ah = offsetof(struct rxe_ah, elem.rcu),
> +	.rcu_offset_qp = offsetof(struct rxe_qp, elem.rcu),
> +	.rcu_offset_mw = offsetof(struct rxe_mw, elem.rcu),
>  };

This isn't quite the right calculation.. It works because the ib
struct is at the top of these structs

Do it like this:

/*
 * When used the core will RCU free the object using the driver provided
 * rcu member.
 */
#define INIT_RDMA_OBJ_SIZE_RCU(ib_struct, drv_struct, member, rcu_member)    \
	.rcu_offset_##ib_struct =                                            \
		(offsetof(struct drv_struct, rcu_member) -                   \
		 offsetof(struct drv_struct, member) +                       \
		 BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, rcu_member) > \
				   offsetof(struct drv_struct, member))),    \
	INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member),

(and maybe the prior patch could stick with the ssize_t and we could
remove the BUILD_BUG_ON_ZERO?)

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index b42e26427a70..ef2f6f88e254 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -25,6 +25,7 @@  struct rxe_pool_elem {
 	struct kref		ref_cnt;
 	struct list_head	list;
 	struct completion	complete;
+	struct rcu_head		rcu;
 	u32			index;
 };
 
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 903f0b71447e..b899924b81eb 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1395,7 +1395,7 @@  static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	if (cleanup_err)
 		rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err);
 
-	kfree_rcu_mightsleep(mr);
+	kfree_rcu(mr, elem.rcu);
 	return 0;
 
 err_out:
@@ -1494,6 +1494,10 @@  static const struct ib_device_ops rxe_dev_ops = {
 	INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
 	INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw),
+
+	.rcu_offset_ah = offsetof(struct rxe_ah, elem.rcu),
+	.rcu_offset_qp = offsetof(struct rxe_qp, elem.rcu),
+	.rcu_offset_mw = offsetof(struct rxe_mw, elem.rcu),
 };
 
 int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)