
[for-next] RDMA/rxe: Fix potential race in rxe_pool_get_index

Message ID 20230629223023.84008-1-rpearsonhpe@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Jason Gunthorpe

Commit Message

Bob Pearson June 29, 2023, 10:30 p.m. UTC
Currently the lookup of an object from its index and the taking of a
reference to that object are protected by rcu_read_lock, but this is
not sufficient: it does not make the xa_load and kref_get combination
an atomic operation.

The various write operations need to share the xarray state from a
mixture of user, soft irq and hard irq contexts, so the xarray locking
must support all of them.

This patch replaces the xa locks with xa_lock_irqsave.

Fixes: 3225717f6dfa ("RDMA/rxe: Replace red-black trees by xarrays")
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_pool.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)


base-commit: 5f004bcaee4cb552cf1b46a505f18f08777db7e5

Comments

Jason Gunthorpe June 29, 2023, 11:18 p.m. UTC | #1
On Thu, Jun 29, 2023 at 05:30:24PM -0500, Bob Pearson wrote:
> Currently the lookup of an object from its index and taking a
> reference to the object are incorrectly protected by an rcu_read_lock
> but this does not make the xa_load and the kref_get combination an
> atomic operation.
> 
> The various write operations need to share the xarray state in a
> mixture of user, soft irq and hard irq contexts so the xa_locking
> must support that.
> 
> This patch replaces the xa locks with xa_lock_irqsave.
> 
> Fixes: 3225717f6dfa ("RDMA/rxe: Replace red-black trees by xarrays")
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> ---
>  drivers/infiniband/sw/rxe/rxe_pool.c | 24 ++++++++++++++++++------
>  1 file changed, 18 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> index 6215c6de3a84..f2b586249793 100644
> --- a/drivers/infiniband/sw/rxe/rxe_pool.c
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> @@ -119,8 +119,10 @@ void rxe_pool_cleanup(struct rxe_pool *pool)
>  int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
>  				bool sleepable)
>  {
> -	int err;
> +	struct xarray *xa = &pool->xa;
> +	unsigned long flags;
>  	gfp_t gfp_flags;
> +	int err;
>  
>  	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
>  		goto err_cnt;
> @@ -138,8 +140,10 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
>  
>  	if (sleepable)
>  		might_sleep();
> -	err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit,
> +	xa_lock_irqsave(xa, flags);
> +	err = __xa_alloc_cyclic(xa, &elem->index, NULL, pool->limit,
>  			      &pool->next, gfp_flags);
> +	xa_unlock_irqrestore(xa, flags);

This stuff doesn't make any sense, the non __ versions already take
the xa_lock internally.

Or is this because you need the save/restore version for some reason?
But that seems unrelated and there should be a lockdep oops to go
along with it showing the backtrace??
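
For reference, the plain helper is roughly the following wrapper (paraphrased from
include/linux/xarray.h, not copied verbatim; there are also _bh and _irq flavours
that take the corresponding lock variants):

static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
				  struct xa_limit limit, u32 *next, gfp_t gfp)
{
	int err;

	xa_lock(xa);		/* plain spin_lock on xa->xa_lock */
	err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
	xa_unlock(xa);

	return err;
}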

> @@ -154,15 +158,16 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
>  {
>  	struct rxe_pool_elem *elem;
>  	struct xarray *xa = &pool->xa;
> +	unsigned long flags;
>  	void *obj;
>  
> -	rcu_read_lock();
> +	xa_lock_irqsave(xa, flags);
>  	elem = xa_load(xa, index);
>  	if (elem && kref_get_unless_zero(&elem->ref_cnt))
>  		obj = elem->obj;
>  	else
>  		obj = NULL;
> -	rcu_read_unlock();
> +	xa_unlock_irqrestore(xa, flags);

And this should be safe as long as the object is freed via RCU, so
what are you trying to fix?

Jason
Bob Pearson June 30, 2023, 3:33 p.m. UTC | #2
On 6/29/23 18:18, Jason Gunthorpe wrote:
> On Thu, Jun 29, 2023 at 05:30:24PM -0500, Bob Pearson wrote:
>> Currently the lookup of an object from its index and taking a
>> reference to the object are incorrectly protected by an rcu_read_lock
>> but this does not make the xa_load and the kref_get combination an
>> atomic operation.
>>
>> The various write operations need to share the xarray state in a
>> mixture of user, soft irq and hard irq contexts so the xa_locking
>> must support that.
>>
>> This patch replaces the xa locks with xa_lock_irqsave.
>>
>> Fixes: 3225717f6dfa ("RDMA/rxe: Replace red-black trees by xarrays")
>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>> ---
>>  drivers/infiniband/sw/rxe/rxe_pool.c | 24 ++++++++++++++++++------
>>  1 file changed, 18 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
>> index 6215c6de3a84..f2b586249793 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_pool.c
>> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
>> @@ -119,8 +119,10 @@ void rxe_pool_cleanup(struct rxe_pool *pool)
>>  int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
>>  				bool sleepable)
>>  {
>> -	int err;
>> +	struct xarray *xa = &pool->xa;
>> +	unsigned long flags;
>>  	gfp_t gfp_flags;
>> +	int err;
>>  
>>  	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
>>  		goto err_cnt;
>> @@ -138,8 +140,10 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
>>  
>>  	if (sleepable)
>>  		might_sleep();
>> -	err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit,
>> +	xa_lock_irqsave(xa, flags);
>> +	err = __xa_alloc_cyclic(xa, &elem->index, NULL, pool->limit,
>>  			      &pool->next, gfp_flags);
>> +	xa_unlock_irqrestore(xa, flags);
> 
> This stuff doesn't make any sense, the non __ versions already take
> the xa_lock internally.
> 
> Or is this because you need the save/restore version for some reason?
> But that seems unrelated and there should be a lockdep oops to go
> along with it showing the backtrace??

The background here is that we are testing a 256 node system with the Lustre file system and doing
fail-over/fail-back testing, which is very high stress. This has uncovered several bugs, of which this
is just one.
The logic is: first we need to lock the lookup in rxe_pool_get_index(). When we tried to run
with ordinary spin_locks we got lots of deadlocks. These are related to taking spin locks while in
(soft irq) interrupt context. In theory we could also get called in hard irq context, so we might as well
convert the locks to spin_lock_irqsave(), which is safe in all cases.
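
To make the deadlock concrete, a minimal sketch of the pattern we hit with the plain
lock and of the irqsave variant that avoids it (the call sites are illustrative, not
actual backtraces):

	/* What we had: process context (e.g. a verbs call) takes the plain xa_lock ... */
	xa_lock(&pool->xa);			/* == spin_lock(&pool->xa.xa_lock) */
	/*
	 * ... a completion interrupt lands on the same CPU and the softirq
	 * path calls rxe_pool_get_index(), which tries to take the same
	 * lock -> the CPU spins on a lock it already holds.
	 */

	/*
	 * What the patch does: disable interrupts while the lock is held,
	 * which closes that window and is also safe if we are already in
	 * soft or hard irq context.
	 */
	unsigned long flags;

	xa_lock_irqsave(&pool->xa, flags);
	err = __xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit,
				&pool->next, gfp_flags);
	xa_unlock_irqrestore(&pool->xa, flags);
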
> 
>> @@ -154,15 +158,16 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
>>  {
>>  	struct rxe_pool_elem *elem;
>>  	struct xarray *xa = &pool->xa;
>> +	unsigned long flags;
>>  	void *obj;
>>  
>> -	rcu_read_lock();
>> +	xa_lock_irqsave(xa, flags);
>>  	elem = xa_load(xa, index);
>>  	if (elem && kref_get_unless_zero(&elem->ref_cnt))
>>  		obj = elem->obj;
>>  	else
>>  		obj = NULL;
>> -	rcu_read_unlock();
>> +	xa_unlock_irqrestore(xa, flags);
> 
> And this should be safe as long as the object is freed via RCU, so
> what are you trying to fix?

The problem here is that rcu_read_lock only helps us if the object is freed with kfree_rcu.
But we have no control over what rdma-core does, and it does *not* do that for e.g. qp's.
If this code gets scheduled off the cpu between the xa_load and the kref_get, the object can
be completely cleaned up and freed by rdma-core. It isn't likely but it definitely could happen.
This results in a crash in kref_get. Taking the xa_lock here is safer and keeps us out of Leon's hair.
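
For reference, a rough sketch of the window described above (the interleaving and
timing are illustrative):

	CPU 0: rxe_pool_get_index()		CPU 1: qp destroy via ib core

	rcu_read_lock();
	elem = xa_load(xa, index);
						xa_erase(xa, elem->index);
						last kref_put() drops ref_cnt to 0
						ib core frees the memory with a
						plain kfree() (not kfree_rcu(), so
						the RCU read-side section on CPU 0
						does not delay the free)
	kref_get_unless_zero(&elem->ref_cnt);	/* use after free */
	rcu_read_unlock();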

Bob
> 
> Jason
Jason Gunthorpe July 10, 2023, 4:47 p.m. UTC | #3
On Fri, Jun 30, 2023 at 10:33:38AM -0500, Bob Pearson wrote:
> On 6/29/23 18:18, Jason Gunthorpe wrote:
> > On Thu, Jun 29, 2023 at 05:30:24PM -0500, Bob Pearson wrote:
> >> Currently the lookup of an object from its index and taking a
> >> reference to the object are incorrectly protected by an rcu_read_lock
> >> but this does not make the xa_load and the kref_get combination an
> >> atomic operation.
> >>
> >> The various write operations need to share the xarray state in a
> >> mixture of user, soft irq and hard irq contexts so the xa_locking
> >> must support that.
> >>
> >> This patch replaces the xa locks with xa_lock_irqsave.
> >>
> >> Fixes: 3225717f6dfa ("RDMA/rxe: Replace red-black trees by xarrays")
> >> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> >> ---
> >>  drivers/infiniband/sw/rxe/rxe_pool.c | 24 ++++++++++++++++++------
> >>  1 file changed, 18 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> >> index 6215c6de3a84..f2b586249793 100644
> >> --- a/drivers/infiniband/sw/rxe/rxe_pool.c
> >> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> >> @@ -119,8 +119,10 @@ void rxe_pool_cleanup(struct rxe_pool *pool)
> >>  int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
> >>  				bool sleepable)
> >>  {
> >> -	int err;
> >> +	struct xarray *xa = &pool->xa;
> >> +	unsigned long flags;
> >>  	gfp_t gfp_flags;
> >> +	int err;
> >>  
> >>  	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
> >>  		goto err_cnt;
> >> @@ -138,8 +140,10 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
> >>  
> >>  	if (sleepable)
> >>  		might_sleep();
> >> -	err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit,
> >> +	xa_lock_irqsave(xa, flags);
> >> +	err = __xa_alloc_cyclic(xa, &elem->index, NULL, pool->limit,
> >>  			      &pool->next, gfp_flags);
> >> +	xa_unlock_irqrestore(xa, flags);
> > 
> > This stuff doesn't make any sense, the non __ versions already take
> > the xa_lock internally.
> > 
> > Or is this because you need the save/restore version for some reason?
> > But that seems unrelated and there should be a lockdep oops to go
> > along with it showing the backtrace??
> 
> The background here is that we are testing a 256 node system with
> the Lustre file system and doing fail-over-fail-back testing which
> is very high stress. This has uncovered several bugs where this is
> just one.

> The logic is 1st need to lock the lookup in rxe_pool_get_index()
> then when we tried to run with ordinary spin_locks we got lots of
> deadlocks. These are related to taking spin locks while in (soft
> irq) interrupt mode. In theory we could also get called in hard irq
> mode so might as well convert the locks to spin_lock_irqsave() which
> is safe in all cases.

That should be its own patch with justification..
 
> >> @@ -154,15 +158,16 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
> >>  {
> >>  	struct rxe_pool_elem *elem;
> >>  	struct xarray *xa = &pool->xa;
> >> +	unsigned long flags;
> >>  	void *obj;
> >>  
> >> -	rcu_read_lock();
> >> +	xa_lock_irqsave(xa, flags);
> >>  	elem = xa_load(xa, index);
> >>  	if (elem && kref_get_unless_zero(&elem->ref_cnt))
> >>  		obj = elem->obj;
> >>  	else
> >>  		obj = NULL;
> >> -	rcu_read_unlock();
> >> +	xa_unlock_irqrestore(xa, flags);
> > 
> > And this should be safe as long as the object is freed via RCU, so
> > what are you trying to fix?
> 
> The problem here is that rcu_read_lock only helps us if the object is freed with kfree_rcu.
> But we have no control over what rdma-core does and it does *not* do
> that for e.g. qp's.

Oh, yes that does sound right. This is another patch with this
explanation.

Jason
Bob Pearson July 10, 2023, 6:11 p.m. UTC | #4
On 7/10/23 11:47, Jason Gunthorpe wrote:
> On Fri, Jun 30, 2023 at 10:33:38AM -0500, Bob Pearson wrote:
>> On 6/29/23 18:18, Jason Gunthorpe wrote:
>>> On Thu, Jun 29, 2023 at 05:30:24PM -0500, Bob Pearson wrote:
>>>> Currently the lookup of an object from its index and taking a
>>>> reference to the object are incorrectly protected by an rcu_read_lock
>>>> but this does not make the xa_load and the kref_get combination an
>>>> atomic operation.
>>>>
>>>> The various write operations need to share the xarray state in a
>>>> mixture of user, soft irq and hard irq contexts so the xa_locking
>>>> must support that.
>>>>
>>>> This patch replaces the xa locks with xa_lock_irqsave.
>>>>
>>>> Fixes: 3225717f6dfa ("RDMA/rxe: Replace red-black trees by xarrays")
>>>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>>>> ---
>>>>  drivers/infiniband/sw/rxe/rxe_pool.c | 24 ++++++++++++++++++------
>>>>  1 file changed, 18 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
>>>> index 6215c6de3a84..f2b586249793 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe_pool.c
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
>>>> @@ -119,8 +119,10 @@ void rxe_pool_cleanup(struct rxe_pool *pool)
>>>>  int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
>>>>  				bool sleepable)
>>>>  {
>>>> -	int err;
>>>> +	struct xarray *xa = &pool->xa;
>>>> +	unsigned long flags;
>>>>  	gfp_t gfp_flags;
>>>> +	int err;
>>>>  
>>>>  	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
>>>>  		goto err_cnt;
>>>> @@ -138,8 +140,10 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
>>>>  
>>>>  	if (sleepable)
>>>>  		might_sleep();
>>>> -	err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit,
>>>> +	xa_lock_irqsave(xa, flags);
>>>> +	err = __xa_alloc_cyclic(xa, &elem->index, NULL, pool->limit,
>>>>  			      &pool->next, gfp_flags);
>>>> +	xa_unlock_irqrestore(xa, flags);
>>>
>>> This stuff doesn't make any sense, the non __ versions already take
>>> the xa_lock internally.
>>>
>>> Or is this because you need the save/restore version for some reason?
>>> But that seems unrelated and there should be a lockdep oops to go
>>> along with it showing the backtrace??
>>
>> The background here is that we are testing a 256 node system with
>> the Lustre file system and doing fail-over-fail-back testing which
>> is very high stress. This has uncovered several bugs where this is
>> just one.
> 
>> The logic is 1st need to lock the lookup in rxe_pool_get_index()
>> then when we tried to run with ordinary spin_locks we got lots of
>> deadlocks. These are related to taking spin locks while in (soft
>> irq) interrupt mode. In theory we could also get called in hard irq
>> mode so might as well convert the locks to spin_lock_irqsave() which
>> is safe in all cases.
> 
> That should be its own patch with justification..
>  
>>>> @@ -154,15 +158,16 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
>>>>  {
>>>>  	struct rxe_pool_elem *elem;
>>>>  	struct xarray *xa = &pool->xa;
>>>> +	unsigned long flags;
>>>>  	void *obj;
>>>>  
>>>> -	rcu_read_lock();
>>>> +	xa_lock_irqsave(xa, flags);
>>>>  	elem = xa_load(xa, index);
>>>>  	if (elem && kref_get_unless_zero(&elem->ref_cnt))
>>>>  		obj = elem->obj;
>>>>  	else
>>>>  		obj = NULL;
>>>> -	rcu_read_unlock();
>>>> +	xa_unlock_irqrestore(xa, flags);
>>>
>>> And this should be safe as long as the object is freed via RCU, so
>>> what are you trying to fix?
>>
>> The problem here is that rcu_read_lock only helps us if the object is freed with kfree_rcu.
>> But we have no control over what rdma-core does and it does *not* do
>> that for e.g. qp's.
> 
> Oh, yes that does sound right. This is another patch with this
> explanation.
> 
> Jason

Some news on this. After some testing it turns out that replacing rcu_read_lock() with xa_lock_irqsave()
in rxe_pool_get_index() performs very badly with a large number of QPs. ib_send_bw -q 32 spends
about 40% of its time trying to get the spinlock on a 24 thread CPU with local loopback. With rcu_read_lock,
performance is what I expect. So, since we don't actually see this race, we are reverting that change.
Without it the irqsave locks aren't required either. So for now please ignore this patch.

The only way to address this risk without compromising performance would be to find a way to indicate to
rdma-core that it should use kfree_rcu instead of kfree when the driver needs it. A bit mask indexed by
object type and exchanged at ib_register_device() would cover it - something like the sketch below.
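
Purely as a sketch of that idea - none of these names exist today; the enum, the flag
values and the ib_device_ops field are all hypothetical:

	/* hypothetical: one bit per object type the driver looks up under
	 * rcu_read_lock() and therefore needs freed with kfree_rcu()
	 */
	enum rdma_rcu_free_objs {
		RDMA_RCU_FREE_QP	= BIT(0),
		RDMA_RCU_FREE_CQ	= BIT(1),
		RDMA_RCU_FREE_MR	= BIT(2),
	};

	/* hypothetical new field, filled in by the driver before
	 * ib_register_device(), telling the core which destroy paths must
	 * use kfree_rcu() instead of kfree()
	 */
	static const struct ib_device_ops rxe_dev_ops = {
		/* ... existing ops ... */
		.rcu_free_obj_mask	= RDMA_RCU_FREE_QP | RDMA_RCU_FREE_CQ,
	};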

Bob
Jason Gunthorpe July 10, 2023, 6:15 p.m. UTC | #5
On Mon, Jul 10, 2023 at 01:11:28PM -0500, Bob Pearson wrote:

> New news on this. After some testing it turns out that replacing
> rcu_read_lock() by xa_lock_irqsave() in rxe_pool_get_index() with a
> large number of QPs has very bad performance. ib_send_bw -q 32
> spends about 40% of its time trying to get the spinlock on a 24
> thread CPU with local loopback. With rcu_read_lock performance is
> what I expect. So, since we don't actually see this race we are
> reverting that change.  Without it the irqsave locks aren't required
> either. So for now please ignore this patch.

Well, no, you need to fix this now that you found it - you need to
make the core code RCU-free the things that rxe thinks are RCU protected.

And maybe we should just have the core code do it always; we've wanted
RCU-freed objects for other reasons anyhow.
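
Something like the following in the core destroy path would be enough - a minimal
sketch, assuming an rcu_head can be added to the core-allocated object (the helper
name and field placement are illustrative):

	struct ib_qp {
		/* ... existing members ... */
		struct rcu_head	rcu;	/* assumed new member */
	};

	static void rdma_free_qp(struct ib_qp *qp)	/* illustrative helper */
	{
		/*
		 * Defer the actual free past any in-flight rcu_read_lock()
		 * sections, so rxe's xa_load() + kref_get_unless_zero() in
		 * rxe_pool_get_index() never touches freed memory.
		 */
		kfree_rcu(qp, rcu);
	}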

Jason

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 6215c6de3a84..f2b586249793 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -119,8 +119,10 @@  void rxe_pool_cleanup(struct rxe_pool *pool)
 int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
 				bool sleepable)
 {
-	int err;
+	struct xarray *xa = &pool->xa;
+	unsigned long flags;
 	gfp_t gfp_flags;
+	int err;
 
 	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
 		goto err_cnt;
@@ -138,8 +140,10 @@  int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
 
 	if (sleepable)
 		might_sleep();
-	err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit,
+	xa_lock_irqsave(xa, flags);
+	err = __xa_alloc_cyclic(xa, &elem->index, NULL, pool->limit,
 			      &pool->next, gfp_flags);
+	xa_unlock_irqrestore(xa, flags);
 	if (err < 0)
 		goto err_cnt;
 
@@ -154,15 +158,16 @@  void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
 {
 	struct rxe_pool_elem *elem;
 	struct xarray *xa = &pool->xa;
+	unsigned long flags;
 	void *obj;
 
-	rcu_read_lock();
+	xa_lock_irqsave(xa, flags);
 	elem = xa_load(xa, index);
 	if (elem && kref_get_unless_zero(&elem->ref_cnt))
 		obj = elem->obj;
 	else
 		obj = NULL;
-	rcu_read_unlock();
+	xa_unlock_irqrestore(xa, flags);
 
 	return obj;
 }
@@ -179,6 +184,7 @@  int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
 	struct rxe_pool *pool = elem->pool;
 	struct xarray *xa = &pool->xa;
 	static int timeout = RXE_POOL_TIMEOUT;
+	unsigned long flags;
 	int ret, err = 0;
 	void *xa_ret;
 
@@ -188,7 +194,9 @@  int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
 	/* erase xarray entry to prevent looking up
 	 * the pool elem from its index
 	 */
-	xa_ret = xa_erase(xa, elem->index);
+	xa_lock_irqsave(xa, flags);
+	xa_ret = __xa_erase(xa, elem->index);
+	xa_unlock_irqrestore(xa, flags);
 	WARN_ON(xa_err(xa_ret));
 
 	/* if this is the last call to rxe_put complete the
@@ -249,8 +257,12 @@  int __rxe_put(struct rxe_pool_elem *elem)
 
 void __rxe_finalize(struct rxe_pool_elem *elem)
 {
+	struct xarray *xa = &elem->pool->xa;
+	unsigned long flags;
 	void *xa_ret;
 
-	xa_ret = xa_store(&elem->pool->xa, elem->index, elem, GFP_KERNEL);
+	xa_lock_irqsave(xa, flags);
+	xa_ret = __xa_store(xa, elem->index, elem, GFP_KERNEL);
+	xa_unlock_irqrestore(xa, flags);
 	WARN_ON(xa_err(xa_ret));
 }