diff mbox series

[for-next,1/6] RDMA/rxe: Make rxe_alloc() take pool lock

Message ID 20211010235931.24042-2-rpearsonhpe@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Jason Gunthorpe
Headers show
Series RDMA/rxe: Fix potential races | expand

Commit Message

Bob Pearson Oct. 10, 2021, 11:59 p.m. UTC
In rxe there are two separate pool APIs for creating a new object:
rxe_alloc() and rxe_alloc_locked(). Currently they are identical.
Make rxe_alloc() take the pool lock, which is in line with the other
APIs in the library.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_pool.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

Comments

Jason Gunthorpe Oct. 20, 2021, 11:16 p.m. UTC | #1
On Sun, Oct 10, 2021 at 06:59:26PM -0500, Bob Pearson wrote:
> In rxe there are two separate pool APIs for creating a new object
> rxe_alloc() and rxe_alloc_locked(). Currently they are identical.
> Make rxe_alloc() take the pool lock which is in line with the other
> APIs in the library.
> 
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>  drivers/infiniband/sw/rxe/rxe_pool.c | 21 ++++-----------------
>  1 file changed, 4 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> index ffa8420b4765..7a288ebacceb 100644
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> @@ -352,27 +352,14 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
>  
>  void *rxe_alloc(struct rxe_pool *pool)
>  {
> -	struct rxe_type_info *info = &rxe_type_info[pool->type];
> -	struct rxe_pool_entry *elem;
> +	unsigned long flags;
>  	u8 *obj;
>  
> -	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
> -		goto out_cnt;
> -
> -	obj = kzalloc(info->size, GFP_KERNEL);
> -	if (!obj)
> -		goto out_cnt;
> -
> -	elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
> -
> -	elem->pool = pool;
> -	kref_init(&elem->ref_cnt);
> +	write_lock_irqsave(&pool->pool_lock, flags);
> +	obj = rxe_alloc_locked(pool);
> +	write_unlock_irqrestore(&pool->pool_lock, flags);

But why? This just makes a GFP_KERNEL allocation into a GFP_ATOMIC
allocation, which is bad.

Jason
Bob Pearson Oct. 21, 2021, 5:46 p.m. UTC | #2
On 10/20/21 6:16 PM, Jason Gunthorpe wrote:
> On Sun, Oct 10, 2021 at 06:59:26PM -0500, Bob Pearson wrote:
>> In rxe there are two separate pool APIs for creating a new object
>> rxe_alloc() and rxe_alloc_locked(). Currently they are identical.
>> Make rxe_alloc() take the pool lock which is in line with the other
>> APIs in the library.
>>
>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>>  drivers/infiniband/sw/rxe/rxe_pool.c | 21 ++++-----------------
>>  1 file changed, 4 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
>> index ffa8420b4765..7a288ebacceb 100644
>> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
>> @@ -352,27 +352,14 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
>>  
>>  void *rxe_alloc(struct rxe_pool *pool)
>>  {
>> -	struct rxe_type_info *info = &rxe_type_info[pool->type];
>> -	struct rxe_pool_entry *elem;
>> +	unsigned long flags;
>>  	u8 *obj;
>>  
>> -	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
>> -		goto out_cnt;
>> -
>> -	obj = kzalloc(info->size, GFP_KERNEL);
>> -	if (!obj)
>> -		goto out_cnt;
>> -
>> -	elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
>> -
>> -	elem->pool = pool;
>> -	kref_init(&elem->ref_cnt);
>> +	write_lock_irqsave(&pool->pool_lock, flags);
>> +	obj = rxe_alloc_locked(pool);
>> +	write_unlock_irqrestore(&pool->pool_lock, flags);
> 
> But why? This just makes a GFP_KERNEL allocation into a GFP_ATOMIC
> allocation, which is bad.
> 
> Jason
> 
How bad? It only has to happen once in the driver, for mcast group elements, where
currently (to avoid the race when two QPs try to join the same mcast grp
on different CPUs at the same time) I have:

	spin_lock()
	grp = rxe_get_key_locked(pool, mgid)
	if !grp
		grp = rxe_alloc_locked(pool)
	spin_unlock()

Here the kzalloc has to be GFP_ATOMIC. But after fixing things to move the
kzalloc out of the lock in rxe_alloc(), I could write:

	newgrp = rxe_alloc(pool)	/* using GFP_KERNEL */
	spin_lock()
	grp = rxe_get_key_locked(pool, mgid)
	if (grp)
		kfree(newgrp)
	else {
		grp = newgrp
		<set key in grp>
	}
	spin_unlock()

A typical use case would be for a bunch of QPs to join a mcast group, and most of the
time the key lookup succeeds. The trade-off is between an extra malloc/free and occasional
bad behavior from GFP_ATOMIC.

The majority of uses for rxe_alloc() do not have these issues and I can move the kzalloc
outside of the lock and use GFP_KERNEL.

Bob
Jason Gunthorpe Oct. 25, 2021, 12:43 p.m. UTC | #3
On Thu, Oct 21, 2021 at 12:46:50PM -0500, Bob Pearson wrote:
> On 10/20/21 6:16 PM, Jason Gunthorpe wrote:
> > On Sun, Oct 10, 2021 at 06:59:26PM -0500, Bob Pearson wrote:
> >> In rxe there are two separate pool APIs for creating a new object
> >> rxe_alloc() and rxe_alloc_locked(). Currently they are identical.
> >> Make rxe_alloc() take the pool lock which is in line with the other
> >> APIs in the library.
> >>
> >> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> >>  drivers/infiniband/sw/rxe/rxe_pool.c | 21 ++++-----------------
> >>  1 file changed, 4 insertions(+), 17 deletions(-)
> >>
> >> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> >> index ffa8420b4765..7a288ebacceb 100644
> >> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> >> @@ -352,27 +352,14 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
> >>  
> >>  void *rxe_alloc(struct rxe_pool *pool)
> >>  {
> >> -	struct rxe_type_info *info = &rxe_type_info[pool->type];
> >> -	struct rxe_pool_entry *elem;
> >> +	unsigned long flags;
> >>  	u8 *obj;
> >>  
> >> -	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
> >> -		goto out_cnt;
> >> -
> >> -	obj = kzalloc(info->size, GFP_KERNEL);
> >> -	if (!obj)
> >> -		goto out_cnt;
> >> -
> >> -	elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
> >> -
> >> -	elem->pool = pool;
> >> -	kref_init(&elem->ref_cnt);
> >> +	write_lock_irqsave(&pool->pool_lock, flags);
> >> +	obj = rxe_alloc_locked(pool);
> >> +	write_unlock_irqrestore(&pool->pool_lock, flags);
> > 
> > But why? This just makes a GFP_KERNEL allocation into a GFP_ATOMIC
> > allocation, which is bad.
> > 
> > Jason
> > 
> how bad?

Quite bad..

> It only has to happen once in the driver for mcast group elements
> where currently I have (to avoid the race when two QPs try to join
> the same mcast grp on different CPUs at the same time)
> 
> 	spin_lock()
> 	grp = rxe_get_key_locked(pool, mgid)
> 	if !grp
> 		grp = rxe_alloc_locked(pool)
> 	spin_unlock()

When you have xarray the general pattern is:

old = xa_load(mgid)
if (old)
   return old

new = kzalloc()
old = xa_cmpxchg(mgid, NULL, new)
if (old)
     kfree(new)
     return old;
return new;

There are several examples of this with various locking patterns

Jason
Bob Pearson Oct. 25, 2021, 6:48 p.m. UTC | #4
This looks useful but I had given up using xarrays to hold multicast
groups because the mgids are 128 bits. The ib_attach_mcast() verb just
passes in the QP and MGID and I couldn't think of an efficient way to
reduce the MGID to a 32 bit non-sparse index. The xarray documentation
advises against hashing the object to get an index. If Linux hands out
mcast group IDs sequentially, it may be OK to just use the flag and
scope bits concatenated with the low-order 24 bits of the group number,
but one would have to deal with collisions (which may never happen). If
this worked, I could get rid of all the RB trees and make everything be
xarrays.

Bob

On Mon, Oct 25, 2021 at 7:43 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Oct 21, 2021 at 12:46:50PM -0500, Bob Pearson wrote:
> > On 10/20/21 6:16 PM, Jason Gunthorpe wrote:
> > > On Sun, Oct 10, 2021 at 06:59:26PM -0500, Bob Pearson wrote:
> > >> In rxe there are two separate pool APIs for creating a new object
> > >> rxe_alloc() and rxe_alloc_locked(). Currently they are identical.
> > >> Make rxe_alloc() take the pool lock which is in line with the other
> > >> APIs in the library.
> > >>
> > >> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> > >>  drivers/infiniband/sw/rxe/rxe_pool.c | 21 ++++-----------------
> > >>  1 file changed, 4 insertions(+), 17 deletions(-)
> > >>
> > >> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> > >> index ffa8420b4765..7a288ebacceb 100644
> > >> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> > >> @@ -352,27 +352,14 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
> > >>
> > >>  void *rxe_alloc(struct rxe_pool *pool)
> > >>  {
> > >> -  struct rxe_type_info *info = &rxe_type_info[pool->type];
> > >> -  struct rxe_pool_entry *elem;
> > >> +  unsigned long flags;
> > >>    u8 *obj;
> > >>
> > >> -  if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
> > >> -          goto out_cnt;
> > >> -
> > >> -  obj = kzalloc(info->size, GFP_KERNEL);
> > >> -  if (!obj)
> > >> -          goto out_cnt;
> > >> -
> > >> -  elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
> > >> -
> > >> -  elem->pool = pool;
> > >> -  kref_init(&elem->ref_cnt);
> > >> +  write_lock_irqsave(&pool->pool_lock, flags);
> > >> +  obj = rxe_alloc_locked(pool);
> > >> +  write_unlock_irqrestore(&pool->pool_lock, flags);
> > >
> > > But why? This just makes a GFP_KERNEL allocation into a GFP_ATOMIC
> > > allocation, which is bad.
> > >
> > > Jason
> > >
> > how bad?
>
> Quite bad..
>
> > It only has to happen once in the driver for mcast group elements
> > where currently I have (to avoid the race when two QPs try to join
> > the same mcast grp on different CPUs at the same time)
> >
> >       spin_lock()
> >       grp = rxe_get_key_locked(pool, mgid)
> >       if !grp
> >               grp = rxe_alloc_locked(pool)
> >       spin_unlock()
>
> When you have xarray the general pattern is:
>
> old = xa_load(mgid)
> if (old)
>    return old
>
> new = kzalloc()
> old = xa_cmpxchg(mgid, NULL, new)
> if (old)
>      kfree(new)
>      return old;
> return new;
>
> There are several examples of this with various locking patterns
>
> Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index ffa8420b4765..7a288ebacceb 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -352,27 +352,14 @@  void *rxe_alloc_locked(struct rxe_pool *pool)
 
 void *rxe_alloc(struct rxe_pool *pool)
 {
-	struct rxe_type_info *info = &rxe_type_info[pool->type];
-	struct rxe_pool_entry *elem;
+	unsigned long flags;
 	u8 *obj;
 
-	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
-		goto out_cnt;
-
-	obj = kzalloc(info->size, GFP_KERNEL);
-	if (!obj)
-		goto out_cnt;
-
-	elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
-
-	elem->pool = pool;
-	kref_init(&elem->ref_cnt);
+	write_lock_irqsave(&pool->pool_lock, flags);
+	obj = rxe_alloc_locked(pool);
+	write_unlock_irqrestore(&pool->pool_lock, flags);
 
 	return obj;
-
-out_cnt:
-	atomic_dec(&pool->num_elem);
-	return NULL;
 }
 
 int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem)