diff mbox series

[for-next,v10,07/17] RDMA/rxe: Use kzmalloc/kfree for mca

Message ID 20220131220849.10170-8-rpearsonhpe@gmail.com (mailing list archive)
State Superseded
Headers show
Series Move two object pools to rxe_mcast.c | expand

Commit Message

Bob Pearson Jan. 31, 2022, 10:08 p.m. UTC
Remove rxe_mca (was rxe_mc_elem) from rxe pools and use kzmalloc
and kfree to allocate and free. Use the sequence

    <lookup qp>
    new_mca = kzalloc(sizeof(*new_mca), GFP_KERNEL);
    <spin lock>
    <lookup qp again> /* in case of a race */
    <init new_mca>
    <spin unlock>

instead of GFP_ATOMIC inside of the spinlock. Add an extra reference
to multicast group to protect the pointer in the index that maps
mgid to group.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe.c       |   8 --
 drivers/infiniband/sw/rxe/rxe_mcast.c | 102 +++++++++++++++-----------
 drivers/infiniband/sw/rxe/rxe_pool.c  |   5 --
 drivers/infiniband/sw/rxe/rxe_pool.h  |   1 -
 drivers/infiniband/sw/rxe/rxe_verbs.h |   2 -
 5 files changed, 59 insertions(+), 59 deletions(-)

Comments

kernel test robot Feb. 1, 2022, 1:03 a.m. UTC | #1
Hi Bob,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on v5.17-rc2]
[also build test WARNING on next-20220131]
[cannot apply to rdma/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Bob-Pearson/Move-two-object-pools-to-rxe_mcast-c/20220201-061231
base:    26291c54e111ff6ba87a164d85d4a4e134b7315c
config: ia64-allyesconfig (https://download.01.org/0day-ci/archive/20220201/202202010836.EmoG3Ot8-lkp@intel.com/config)
compiler: ia64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/f9d560658bbbd5a17cc3c62e566cb9bb77697530
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Bob-Pearson/Move-two-object-pools-to-rxe_mcast-c/20220201-061231
        git checkout f9d560658bbbd5a17cc3c62e566cb9bb77697530
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=ia64 SHELL=/bin/bash drivers/infiniband/sw/rxe/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/infiniband/sw/rxe/rxe_mcast.c:57:6: warning: no previous prototype for '__rxe_destroy_mcg' [-Wmissing-prototypes]
      57 | void __rxe_destroy_mcg(struct rxe_mcg *grp)
         |      ^~~~~~~~~~~~~~~~~


vim +/__rxe_destroy_mcg +57 drivers/infiniband/sw/rxe/rxe_mcast.c

    55	
    56	/* caller is holding a ref from lookup and mcg->mcg_lock*/
  > 57	void __rxe_destroy_mcg(struct rxe_mcg *grp)
    58	{
    59		rxe_drop_key(grp);
    60		rxe_drop_ref(grp);
    61	
    62		rxe_mcast_delete(grp->rxe, &grp->mgid);
    63	}
    64	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Jason Gunthorpe Feb. 1, 2022, 2:53 p.m. UTC | #2
On Mon, Jan 31, 2022 at 04:08:40PM -0600, Bob Pearson wrote:
> Remove rxe_mca (was rxe_mc_elem) from rxe pools and use kzmalloc
> and kfree to allocate and free. Use the sequence
> 
>     <lookup qp>
>     new_mca = kzalloc(sizeof(*new_mca), GFP_KERNEL);
>     <spin lock>
>     <lookup qp again> /* in case of a race */
>     <init new_mca>
>     <spin unlock>
> 
> instead of GFP_ATOMIC inside of the spinlock. Add an extra reference
> to multicast group to protect the pointer in the index that maps
> mgid to group.
> 
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>  drivers/infiniband/sw/rxe/rxe.c       |   8 --
>  drivers/infiniband/sw/rxe/rxe_mcast.c | 102 +++++++++++++++-----------
>  drivers/infiniband/sw/rxe/rxe_pool.c  |   5 --
>  drivers/infiniband/sw/rxe/rxe_pool.h  |   1 -
>  drivers/infiniband/sw/rxe/rxe_verbs.h |   2 -
>  5 files changed, 59 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index fab291245366..c55736e441e7 100644
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -29,7 +29,6 @@ void rxe_dealloc(struct ib_device *ib_dev)
>  	rxe_pool_cleanup(&rxe->mr_pool);
>  	rxe_pool_cleanup(&rxe->mw_pool);
>  	rxe_pool_cleanup(&rxe->mc_grp_pool);
> -	rxe_pool_cleanup(&rxe->mc_elem_pool);
>  
>  	if (rxe->tfm)
>  		crypto_free_shash(rxe->tfm);
> @@ -163,15 +162,8 @@ static int rxe_init_pools(struct rxe_dev *rxe)
>  	if (err)
>  		goto err9;
>  
> -	err = rxe_pool_init(rxe, &rxe->mc_elem_pool, RXE_TYPE_MC_ELEM,
> -			    rxe->attr.max_total_mcast_qp_attach);
> -	if (err)
> -		goto err10;
> -
>  	return 0;
>  
> -err10:
> -	rxe_pool_cleanup(&rxe->mc_grp_pool);
>  err9:
>  	rxe_pool_cleanup(&rxe->mw_pool);
>  err8:
> diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
> index 9336295c4ee2..4a5896a225a6 100644
> +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
> @@ -26,30 +26,40 @@ static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid)
>  }
>  
>  /* caller should hold mc_grp_pool->pool_lock */
> -static struct rxe_mcg *create_grp(struct rxe_dev *rxe,
> -				     struct rxe_pool *pool,
> -				     union ib_gid *mgid)
> +static int __rxe_create_grp(struct rxe_dev *rxe, struct rxe_pool *pool,
> +			    union ib_gid *mgid, struct rxe_mcg **grp_p)
>  {
>  	int err;
>  	struct rxe_mcg *grp;
>  
>  	grp = rxe_alloc_locked(&rxe->mc_grp_pool);
>  	if (!grp)
> -		return ERR_PTR(-ENOMEM);
> +		return -ENOMEM;
> +
> +	err = rxe_mcast_add(rxe, mgid);
> +	if (unlikely(err)) {
> +		rxe_drop_ref(grp);
> +		return err;
> +	}
>  
>  	INIT_LIST_HEAD(&grp->qp_list);
>  	spin_lock_init(&grp->mcg_lock);
>  	grp->rxe = rxe;
> +
> +	rxe_add_ref(grp);
>  	rxe_add_key_locked(grp, mgid);
>  
> -	err = rxe_mcast_add(rxe, mgid);
> -	if (unlikely(err)) {
> -		rxe_drop_key_locked(grp);
> -		rxe_drop_ref(grp);
> -		return ERR_PTR(err);
> -	}
> +	*grp_p = grp;

This should return the struct rxe_mcg or an ERR_PTR not use output
function arguments, like it was before

> +	return 0;
> +}
> +
> +/* caller is holding a ref from lookup and mcg->mcg_lock*/
> +void __rxe_destroy_mcg(struct rxe_mcg *grp)
> +{
> +	rxe_drop_key(grp);
> +	rxe_drop_ref(grp);
>  
> -	return grp;
> +	rxe_mcast_delete(grp->rxe, &grp->mgid);
>  }
>  
>  static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
> @@ -68,10 +78,9 @@ static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
>  	if (grp)
>  		goto done;
>  
> -	grp = create_grp(rxe, pool, mgid);
> -	if (IS_ERR(grp)) {
> +	err = __rxe_create_grp(rxe, pool, mgid, &grp);
> +	if (err) {
>  		write_unlock_bh(&pool->pool_lock);
> -		err = PTR_ERR(grp);
>  		return err;

This should return the struct rxe_mcg or ERR_PTR too

> @@ -126,7 +143,7 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
>  				   union ib_gid *mgid)
>  {
>  	struct rxe_mcg *grp;
> -	struct rxe_mca *elem, *tmp;
> +	struct rxe_mca *mca, *tmp;
>  
>  	grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid);
>  	if (!grp)
> @@ -134,33 +151,30 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
>  
>  	spin_lock_bh(&grp->mcg_lock);
>  
> +	list_for_each_entry_safe(mca, tmp, &grp->qp_list, qp_list) {
> +		if (mca->qp == qp) {
> +			list_del(&mca->qp_list);
>  			grp->num_qp--;
> +			if (grp->num_qp <= 0)
> +				__rxe_destroy_mcg(grp);
>  			atomic_dec(&qp->mcg_num);
>  
>  			spin_unlock_bh(&grp->mcg_lock);
> +			rxe_drop_ref(grp);

Wwhere did this extra ref come from?

The grp was threaded on a linked list by rxe_attach_mcast, but that
also dropped the extra reference.

The reference should have followed the pointer into the linked list,
then it could be unput here when unthreading it from the linked list.

To me this still looks like there should be exactly one ref, the
rbtree should be removed inside the release function when the ref goes
to zero (under the pool_lock) and doesn't otherwise hold a ref

The linked list on the mca should hold the only ref and when
all the linked list is put back the grp can be released, no need for
qp_num as well.

Jason
kernel test robot Feb. 1, 2022, 6:20 p.m. UTC | #3
Hi Bob,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on v5.17-rc2]
[cannot apply to rdma/for-next next-20220201]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Bob-Pearson/Move-two-object-pools-to-rxe_mcast-c/20220201-061231
base:    26291c54e111ff6ba87a164d85d4a4e134b7315c
config: arm-randconfig-r001-20220131 (https://download.01.org/0day-ci/archive/20220202/202202020207.bxuWR3xX-lkp@intel.com/config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 6b1e844b69f15bb7dffaf9365cd2b355d2eb7579)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install arm cross compiling tool for clang build
        # apt-get install binutils-arm-linux-gnueabi
        # https://github.com/0day-ci/linux/commit/f9d560658bbbd5a17cc3c62e566cb9bb77697530
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Bob-Pearson/Move-two-object-pools-to-rxe_mcast-c/20220201-061231
        git checkout f9d560658bbbd5a17cc3c62e566cb9bb77697530
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash drivers/infiniband/sw/rxe/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/infiniband/sw/rxe/rxe_mcast.c:57:6: warning: no previous prototype for function '__rxe_destroy_mcg' [-Wmissing-prototypes]
   void __rxe_destroy_mcg(struct rxe_mcg *grp)
        ^
   drivers/infiniband/sw/rxe/rxe_mcast.c:57:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void __rxe_destroy_mcg(struct rxe_mcg *grp)
   ^
   static 
   1 warning generated.


vim +/__rxe_destroy_mcg +57 drivers/infiniband/sw/rxe/rxe_mcast.c

    55	
    56	/* caller is holding a ref from lookup and mcg->mcg_lock*/
  > 57	void __rxe_destroy_mcg(struct rxe_mcg *grp)
    58	{
    59		rxe_drop_key(grp);
    60		rxe_drop_ref(grp);
    61	
    62		rxe_mcast_delete(grp->rxe, &grp->mgid);
    63	}
    64	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Bob Pearson Feb. 1, 2022, 8 p.m. UTC | #4
On 2/1/22 08:53, Jason Gunthorpe wrote:
> On Mon, Jan 31, 2022 at 04:08:40PM -0600, Bob Pearson wrote:
>> Remove rxe_mca (was rxe_mc_elem) from rxe pools and use kzmalloc
>> and kfree to allocate and free. Use the sequence
>>
>>     <lookup qp>
>>     new_mca = kzalloc(sizeof(*new_mca), GFP_KERNEL);
>>     <spin lock>
>>     <lookup qp again> /* in case of a race */
>>     <init new_mca>
>>     <spin unlock>
>>
>> instead of GFP_ATOMIC inside of the spinlock. Add an extra reference
>> to multicast group to protect the pointer in the index that maps
>> mgid to group.
>>
>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>>  drivers/infiniband/sw/rxe/rxe.c       |   8 --
>>  drivers/infiniband/sw/rxe/rxe_mcast.c | 102 +++++++++++++++-----------
>>  drivers/infiniband/sw/rxe/rxe_pool.c  |   5 --
>>  drivers/infiniband/sw/rxe/rxe_pool.h  |   1 -
>>  drivers/infiniband/sw/rxe/rxe_verbs.h |   2 -
>>  5 files changed, 59 insertions(+), 59 deletions(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>> index fab291245366..c55736e441e7 100644
>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>> @@ -29,7 +29,6 @@ void rxe_dealloc(struct ib_device *ib_dev)
>>  	rxe_pool_cleanup(&rxe->mr_pool);
>>  	rxe_pool_cleanup(&rxe->mw_pool);
>>  	rxe_pool_cleanup(&rxe->mc_grp_pool);
>> -	rxe_pool_cleanup(&rxe->mc_elem_pool);
>>  
>>  	if (rxe->tfm)
>>  		crypto_free_shash(rxe->tfm);
>> @@ -163,15 +162,8 @@ static int rxe_init_pools(struct rxe_dev *rxe)
>>  	if (err)
>>  		goto err9;
>>  
>> -	err = rxe_pool_init(rxe, &rxe->mc_elem_pool, RXE_TYPE_MC_ELEM,
>> -			    rxe->attr.max_total_mcast_qp_attach);
>> -	if (err)
>> -		goto err10;
>> -
>>  	return 0;
>>  
>> -err10:
>> -	rxe_pool_cleanup(&rxe->mc_grp_pool);
>>  err9:
>>  	rxe_pool_cleanup(&rxe->mw_pool);
>>  err8:
>> diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
>> index 9336295c4ee2..4a5896a225a6 100644
>> +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
>> @@ -26,30 +26,40 @@ static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid)
>>  }
>>  
>>  /* caller should hold mc_grp_pool->pool_lock */
>> -static struct rxe_mcg *create_grp(struct rxe_dev *rxe,
>> -				     struct rxe_pool *pool,
>> -				     union ib_gid *mgid)
>> +static int __rxe_create_grp(struct rxe_dev *rxe, struct rxe_pool *pool,
>> +			    union ib_gid *mgid, struct rxe_mcg **grp_p)
>>  {
>>  	int err;
>>  	struct rxe_mcg *grp;
>>  
>>  	grp = rxe_alloc_locked(&rxe->mc_grp_pool);
>>  	if (!grp)
>> -		return ERR_PTR(-ENOMEM);
>> +		return -ENOMEM;
>> +
>> +	err = rxe_mcast_add(rxe, mgid);
>> +	if (unlikely(err)) {
>> +		rxe_drop_ref(grp);
>> +		return err;
>> +	}
>>  
>>  	INIT_LIST_HEAD(&grp->qp_list);
>>  	spin_lock_init(&grp->mcg_lock);
>>  	grp->rxe = rxe;
>> +
>> +	rxe_add_ref(grp);
>>  	rxe_add_key_locked(grp, mgid);
>>  
>> -	err = rxe_mcast_add(rxe, mgid);
>> -	if (unlikely(err)) {
>> -		rxe_drop_key_locked(grp);
>> -		rxe_drop_ref(grp);
>> -		return ERR_PTR(err);
>> -	}
>> +	*grp_p = grp;
> 
> This should return the struct rxe_mcg or an ERR_PTR not use output
> function arguments, like it was before
> 
>> +	return 0;
>> +}
>> +
>> +/* caller is holding a ref from lookup and mcg->mcg_lock*/
>> +void __rxe_destroy_mcg(struct rxe_mcg *grp)
>> +{
>> +	rxe_drop_key(grp);
>> +	rxe_drop_ref(grp);
>>  
>> -	return grp;
>> +	rxe_mcast_delete(grp->rxe, &grp->mgid);
>>  }
>>  
>>  static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
>> @@ -68,10 +78,9 @@ static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
>>  	if (grp)
>>  		goto done;
>>  
>> -	grp = create_grp(rxe, pool, mgid);
>> -	if (IS_ERR(grp)) {
>> +	err = __rxe_create_grp(rxe, pool, mgid, &grp);
>> +	if (err) {
>>  		write_unlock_bh(&pool->pool_lock);
>> -		err = PTR_ERR(grp);
>>  		return err;
> 
> This should return the struct rxe_mcg or ERR_PTR too
> 
>> @@ -126,7 +143,7 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
>>  				   union ib_gid *mgid)
>>  {
>>  	struct rxe_mcg *grp;
>> -	struct rxe_mca *elem, *tmp;
>> +	struct rxe_mca *mca, *tmp;
>>  
>>  	grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid);
>>  	if (!grp)
>> @@ -134,33 +151,30 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
>>  
>>  	spin_lock_bh(&grp->mcg_lock);
>>  
>> +	list_for_each_entry_safe(mca, tmp, &grp->qp_list, qp_list) {
>> +		if (mca->qp == qp) {
>> +			list_del(&mca->qp_list);
>>  			grp->num_qp--;
>> +			if (grp->num_qp <= 0)
>> +				__rxe_destroy_mcg(grp);
>>  			atomic_dec(&qp->mcg_num);
>>  
>>  			spin_unlock_bh(&grp->mcg_lock);
>> +			rxe_drop_ref(grp);
> 
> Wwhere did this extra ref come from?
> 
> The grp was threaded on a linked list by rxe_attach_mcast, but that
> also dropped the extra reference.
> 
> The reference should have followed the pointer into the linked list,
> then it could be unput here when unthreading it from the linked list.
> 
> To me this still looks like there should be exactly one ref, the
> rbtree should be removed inside the release function when the ref goes
> to zero (under the pool_lock) and doesn't otherwise hold a ref
> 
> The linked list on the mca should hold the only ref and when
> all the linked list is put back the grp can be released, no need for
> qp_num as well.
> 
> Jason

Here is a rough picture of what is going on. When not active there are pointers to
each mcg in the red-black tree (from a parent of the rb_node contained in mcg) and
if there are attached qp's there are a pair of pointers from the head and tail of the
linked list else none. When active in either verbs or packet processing code there
is a pointer to mcg held in a local variable by the routine obtained from looking
up the mcg in the tree.

                                  +-----+
                                  | rxe |
                                  +-----+
                                     |
                                   +-v-+
                           +-------|rb |-------+
                           |       +---+       |
                           |                 +-v-+
                           |                 |rb |
                        +--v--+              +---+
                        |(rb) |
       +--------------->| mcg |<---------------+
       |                +-----+                |
       |                                       |
       |    +-----+     +-----+     +-----+    |
       +--->| mca |<--->| mca |<--->| mca |<---+
            +-----+     +-----+     +-----+
               |           |           |
               |           |           |
            +--v--+     +--v--+     +--v--+
            | qp  |     | qp  |     | qp  |
            +-----+     +-----+     +-----+

as currently written the local variable has a kref obtained from the kref_get in
rxe_lookup_mcg or the kref_init in rxe_init_mcg if it is newly created. This ref is
dropped when the local variable goes out of scope. To protect the mcg when it is
inactive at least one more ref is required. I take an additional ref in rxe_get_mcg
if the mcg is created to protect the pointer in the red-black tree. This persists
for the lifetime of the object until it is destroyed when it is removed from the tree
and has the longest scope. This is enough to keep the object alive (it works fine BTW.)
It is also possible to take ref's representing the pointers in the list but they
wouldn't add anything.

On the other point. Is it standard practice to user ERRPTRs in the kernel rather than
return arguments? I seem to have seen both styles used but it may have been my imagination. I don't have any preference here but sometimes choose one or the other
in parallel flows to make comparable routines in the flows have similar interfaces.

Bob
Jason Gunthorpe Feb. 1, 2022, 8:14 p.m. UTC | #5
On Tue, Feb 01, 2022 at 02:00:09PM -0600, Bob Pearson wrote:


> as currently written the local variable has a kref obtained from the kref_get in
> rxe_lookup_mcg or the kref_init in rxe_init_mcg if it is newly created. This ref is
> dropped when the local variable goes out of scope. To protect the mcg when it is
> inactive at least one more ref is required. I take an additional ref in rxe_get_mcg
> if the mcg is created to protect the pointer in the red-black tree. This persists
> for the lifetime of the object until it is destroyed when it is removed from the tree
> and has the longest scope. This is enough to keep the object alive (it works fine BTW.)
> It is also possible to take ref's representing the pointers in the list but they
> wouldn't add anything.

I think I got it upside down, but OK this works for me. What was
kbuild complaining about?

> On the other point. Is it standard practice to user ERRPTRs in the
> kernel rather than return arguments? I seem to have seen both styles
> used but it may have been my imagination. I don't have any
> preference here but sometimes choose one or the other in parallel
> flows to make comparable routines in the flows have similar
> interfaces.

Always prefer errptrs.

Jason
Bob Pearson Feb. 1, 2022, 8:30 p.m. UTC | #6
On 2/1/22 14:14, Jason Gunthorpe wrote:
> On Tue, Feb 01, 2022 at 02:00:09PM -0600, Bob Pearson wrote:
> 
> 
>> as currently written the local variable has a kref obtained from the kref_get in
>> rxe_lookup_mcg or the kref_init in rxe_init_mcg if it is newly created. This ref is
>> dropped when the local variable goes out of scope. To protect the mcg when it is
>> inactive at least one more ref is required. I take an additional ref in rxe_get_mcg
>> if the mcg is created to protect the pointer in the red-black tree. This persists
>> for the lifetime of the object until it is destroyed when it is removed from the tree
>> and has the longest scope. This is enough to keep the object alive (it works fine BTW.)
>> It is also possible to take ref's representing the pointers in the list but they
>> wouldn't add anything.
> 
> I think I got it upside down, but OK this works for me. What was
> kbuild complaining about?
> 
>> On the other point. Is it standard practice to user ERRPTRs in the
>> kernel rather than return arguments? I seem to have seen both styles
>> used but it may have been my imagination. I don't have any
>> preference here but sometimes choose one or the other in parallel
>> flows to make comparable routines in the flows have similar
>> interfaces.
> 
> Always prefer errptrs.
> 
> Jason

this API is a little different than most of the verbs APIs where typically we have

obj = xx_create_obj()
	takes a reference at kref_init() and doesn't drop it on the non error path.
	returns obj.

xx_other_code_using_obj(obj)
	uses obj holding ref

xx_destroy_obj(obj)
	drops the ref to obj

here the object is a local variable in xx_mcast_attach() and nothing is returned.

For InfiniBand the create and destroy mcast group function is triggered by MADs
from some central mcast server. For RoCEv2 that doesn't happen and these APIs
don't exist.

	
a missing static declaration at line 262. If you want me to I can fix it and resend.
Do I really have to mint new version?

Bob
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index fab291245366..c55736e441e7 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -29,7 +29,6 @@  void rxe_dealloc(struct ib_device *ib_dev)
 	rxe_pool_cleanup(&rxe->mr_pool);
 	rxe_pool_cleanup(&rxe->mw_pool);
 	rxe_pool_cleanup(&rxe->mc_grp_pool);
-	rxe_pool_cleanup(&rxe->mc_elem_pool);
 
 	if (rxe->tfm)
 		crypto_free_shash(rxe->tfm);
@@ -163,15 +162,8 @@  static int rxe_init_pools(struct rxe_dev *rxe)
 	if (err)
 		goto err9;
 
-	err = rxe_pool_init(rxe, &rxe->mc_elem_pool, RXE_TYPE_MC_ELEM,
-			    rxe->attr.max_total_mcast_qp_attach);
-	if (err)
-		goto err10;
-
 	return 0;
 
-err10:
-	rxe_pool_cleanup(&rxe->mc_grp_pool);
 err9:
 	rxe_pool_cleanup(&rxe->mw_pool);
 err8:
diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
index 9336295c4ee2..4a5896a225a6 100644
--- a/drivers/infiniband/sw/rxe/rxe_mcast.c
+++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
@@ -26,30 +26,40 @@  static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid)
 }
 
 /* caller should hold mc_grp_pool->pool_lock */
-static struct rxe_mcg *create_grp(struct rxe_dev *rxe,
-				     struct rxe_pool *pool,
-				     union ib_gid *mgid)
+static int __rxe_create_grp(struct rxe_dev *rxe, struct rxe_pool *pool,
+			    union ib_gid *mgid, struct rxe_mcg **grp_p)
 {
 	int err;
 	struct rxe_mcg *grp;
 
 	grp = rxe_alloc_locked(&rxe->mc_grp_pool);
 	if (!grp)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
+
+	err = rxe_mcast_add(rxe, mgid);
+	if (unlikely(err)) {
+		rxe_drop_ref(grp);
+		return err;
+	}
 
 	INIT_LIST_HEAD(&grp->qp_list);
 	spin_lock_init(&grp->mcg_lock);
 	grp->rxe = rxe;
+
+	rxe_add_ref(grp);
 	rxe_add_key_locked(grp, mgid);
 
-	err = rxe_mcast_add(rxe, mgid);
-	if (unlikely(err)) {
-		rxe_drop_key_locked(grp);
-		rxe_drop_ref(grp);
-		return ERR_PTR(err);
-	}
+	*grp_p = grp;
+	return 0;
+}
+
+/* caller is holding a ref from lookup and mcg->mcg_lock*/
+void __rxe_destroy_mcg(struct rxe_mcg *grp)
+{
+	rxe_drop_key(grp);
+	rxe_drop_ref(grp);
 
-	return grp;
+	rxe_mcast_delete(grp->rxe, &grp->mgid);
 }
 
 static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
@@ -68,10 +78,9 @@  static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
 	if (grp)
 		goto done;
 
-	grp = create_grp(rxe, pool, mgid);
-	if (IS_ERR(grp)) {
+	err = __rxe_create_grp(rxe, pool, mgid, &grp);
+	if (err) {
 		write_unlock_bh(&pool->pool_lock);
-		err = PTR_ERR(grp);
 		return err;
 	}
 
@@ -85,36 +94,44 @@  static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
 			   struct rxe_mcg *grp)
 {
 	int err;
-	struct rxe_mca *elem;
+	struct rxe_mca *mca, *new_mca;
 
-	/* check to see of the qp is already a member of the group */
+	/* check to see if the qp is already a member of the group */
 	spin_lock_bh(&grp->mcg_lock);
-	list_for_each_entry(elem, &grp->qp_list, qp_list) {
-		if (elem->qp == qp) {
+	list_for_each_entry(mca, &grp->qp_list, qp_list) {
+		if (mca->qp == qp) {
+			spin_unlock_bh(&grp->mcg_lock);
+			return 0;
+		}
+	}
+	spin_unlock_bh(&grp->mcg_lock);
+
+	/* speculative alloc new mca without using GFP_ATOMIC */
+	new_mca = kzalloc(sizeof(*mca), GFP_KERNEL);
+	if (!new_mca)
+		return -ENOMEM;
+
+	spin_lock_bh(&grp->mcg_lock);
+	/* re-check to see if someone else just attached qp */
+	list_for_each_entry(mca, &grp->qp_list, qp_list) {
+		if (mca->qp == qp) {
+			kfree(new_mca);
 			err = 0;
 			goto out;
 		}
 	}
+	mca = new_mca;
 
 	if (grp->num_qp >= rxe->attr.max_mcast_qp_attach) {
 		err = -ENOMEM;
 		goto out;
 	}
 
-	elem = rxe_alloc_locked(&rxe->mc_elem_pool);
-	if (!elem) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	/* each qp holds a ref on the grp */
-	rxe_add_ref(grp);
-
 	grp->num_qp++;
-	elem->qp = qp;
+	mca->qp = qp;
 	atomic_inc(&qp->mcg_num);
 
-	list_add(&elem->qp_list, &grp->qp_list);
+	list_add(&mca->qp_list, &grp->qp_list);
 
 	err = 0;
 out:
@@ -126,7 +143,7 @@  static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
 				   union ib_gid *mgid)
 {
 	struct rxe_mcg *grp;
-	struct rxe_mca *elem, *tmp;
+	struct rxe_mca *mca, *tmp;
 
 	grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid);
 	if (!grp)
@@ -134,33 +151,30 @@  static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
 
 	spin_lock_bh(&grp->mcg_lock);
 
-	list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) {
-		if (elem->qp == qp) {
-			list_del(&elem->qp_list);
+	list_for_each_entry_safe(mca, tmp, &grp->qp_list, qp_list) {
+		if (mca->qp == qp) {
+			list_del(&mca->qp_list);
 			grp->num_qp--;
+			if (grp->num_qp <= 0)
+				__rxe_destroy_mcg(grp);
 			atomic_dec(&qp->mcg_num);
 
 			spin_unlock_bh(&grp->mcg_lock);
-			rxe_drop_ref(elem);
-			rxe_drop_ref(grp);	/* ref held by QP */
-			rxe_drop_ref(grp);	/* ref from get_key */
+			rxe_drop_ref(grp);
+			kfree(mca);
 			return 0;
 		}
 	}
 
 	spin_unlock_bh(&grp->mcg_lock);
-	rxe_drop_ref(grp);			/* ref from get_key */
+	rxe_drop_ref(grp);
 err1:
 	return -EINVAL;
 }
 
 void rxe_mc_cleanup(struct rxe_pool_elem *elem)
 {
-	struct rxe_mcg *grp = container_of(elem, typeof(*grp), elem);
-	struct rxe_dev *rxe = grp->rxe;
-
-	rxe_drop_key(grp);
-	rxe_mcast_delete(rxe, &grp->mgid);
+	/* nothing left to do */
 }
 
 int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
@@ -170,13 +184,15 @@  int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
 	struct rxe_qp *qp = to_rqp(ibqp);
 	struct rxe_mcg *grp;
 
-	/* takes a ref on grp if successful */
 	err = rxe_mcast_get_grp(rxe, mgid, &grp);
 	if (err)
 		return err;
 
 	err = rxe_mcast_add_grp_elem(rxe, qp, grp);
 
+	if (grp->num_qp == 0)
+		__rxe_destroy_mcg(grp);
+
 	rxe_drop_ref(grp);
 	return err;
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 63c594173565..a6756aa93e2b 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -90,11 +90,6 @@  static const struct rxe_type_info {
 		.key_offset	= offsetof(struct rxe_mcg, mgid),
 		.key_size	= sizeof(union ib_gid),
 	},
-	[RXE_TYPE_MC_ELEM] = {
-		.name		= "rxe-mc_elem",
-		.size		= sizeof(struct rxe_mca),
-		.elem_offset	= offsetof(struct rxe_mca, elem),
-	},
 };
 
 static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min)
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index 214279310f4d..511f81554fd1 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -23,7 +23,6 @@  enum rxe_elem_type {
 	RXE_TYPE_MR,
 	RXE_TYPE_MW,
 	RXE_TYPE_MC_GRP,
-	RXE_TYPE_MC_ELEM,
 	RXE_NUM_TYPES,		/* keep me last */
 };
 
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 55f8ed2bc621..02745d51c163 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -363,7 +363,6 @@  struct rxe_mcg {
 };
 
 struct rxe_mca {
-	struct rxe_pool_elem	elem;
 	struct list_head	qp_list;
 	struct rxe_qp		*qp;
 };
@@ -397,7 +396,6 @@  struct rxe_dev {
 	struct rxe_pool		mr_pool;
 	struct rxe_pool		mw_pool;
 	struct rxe_pool		mc_grp_pool;
-	struct rxe_pool		mc_elem_pool;
 
 	spinlock_t		pending_lock; /* guard pending_mmaps */
 	struct list_head	pending_mmaps;