diff mbox series

[PATCHv5,for-rc1,v5,4/8] RDMA/rxe: Implement dellink in rxe

Message ID 20230428093914.2121131-5-yanjun.zhu@intel.com (mailing list archive)
State Superseded
Headers show
Series Fix the problem that rxe can not work in net namespace | expand

Commit Message

Zhu Yanjun April 28, 2023, 9:39 a.m. UTC
From: Zhu Yanjun <yanjun.zhu@linux.dev>

When running "rdma link del" command, dellink function will be called.
If the sock refcnt is greater than the refcnt needed for udp tunnel,
the sock refcnt will be decreased by 1.

If equal, the last rdma link is removed. The udp tunnel will be
destroyed.

Tested-by: Rain River <rain.1986.08.12@gmail.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
---
 drivers/infiniband/sw/rxe/rxe.c     | 12 +++++++++++-
 drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
 drivers/infiniband/sw/rxe/rxe_net.h |  1 +
 3 files changed, 27 insertions(+), 3 deletions(-)

Comments

Bob Pearson June 20, 2023, 8:21 p.m. UTC | #1
On 4/28/23 04:39, Zhu Yanjun wrote:
> From: Zhu Yanjun <yanjun.zhu@linux.dev>
> 
> When running "rdma link del" command, dellink function will be called.
> If the sock refcnt is greater than the refcnt needed for udp tunnel,
> the sock refcnt will be decreased by 1.
> 
> If equal, the last rdma link is removed. The udp tunnel will be
> destroyed.
> 
> Tested-by: Rain River <rain.1986.08.12@gmail.com>
> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
> ---
>  drivers/infiniband/sw/rxe/rxe.c     | 12 +++++++++++-
>  drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
>  drivers/infiniband/sw/rxe/rxe_net.h |  1 +
>  3 files changed, 27 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 0ce6adb43cfc..ebfabc6d6b76 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>  /* called by ifc layer to create new rxe device.
>   * The caller should allocate memory for rxe by calling ib_alloc_device.
>   */
> +static struct rdma_link_ops rxe_link_ops;
>  int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>  {
>  	rxe_init(rxe);
>  	rxe_set_mtu(rxe, mtu);
> +	rxe->ib_dev.link_ops = &rxe_link_ops;
>  
>  	return rxe_register_device(rxe, ibdev_name);
>  }
> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>  	return err;
>  }
>  
> -struct rdma_link_ops rxe_link_ops = {
> +static int rxe_dellink(struct ib_device *dev)
> +{
> +	rxe_net_del(dev);
> +
> +	return 0;
> +}
> +
> +static struct rdma_link_ops rxe_link_ops = {
>  	.type = "rxe",
>  	.newlink = rxe_newlink,
> +	.dellink = rxe_dellink,
>  };
>  
>  static int __init rxe_module_init(void)
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
> index 3ca92e062800..4cc7de7b115b 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.c
> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>  	return 0;
>  }
>  
> +#define SK_REF_FOR_TUNNEL	2
> +void rxe_net_del(struct ib_device *dev)
> +{
> +	if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
> +		__sock_put(recv_sockets.sk6->sk);
> +	else
> +		rxe_release_udp_tunnel(recv_sockets.sk6);
> +
> +	if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
> +		__sock_put(recv_sockets.sk4->sk);
> +	else
> +		rxe_release_udp_tunnel(recv_sockets.sk4);
> +}
> +#undef SK_REF_FOR_TUNNEL
> +
>  static void rxe_port_event(struct rxe_dev *rxe,
>  			   enum ib_event_type event)
>  {
> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>  
>  void rxe_net_exit(void)
>  {
> -	rxe_release_udp_tunnel(recv_sockets.sk6);
> -	rxe_release_udp_tunnel(recv_sockets.sk4);
>  	unregister_netdevice_notifier(&rxe_net_notifier);
>  }
These calls are moved to rxe_net_del which is called by an explicit unlink command.
But if rxe_net_init fails and returns an error code this will never happen.
This will result in leaking resources.

Bob
>  
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
> index a222c3eeae12..f48f22f3353b 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.h
> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
>  };
>  
>  int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
> +void rxe_net_del(struct ib_device *dev);
>  
>  int rxe_register_notifier(void);
>  int rxe_net_init(void);
Zhu Yanjun June 21, 2023, 2:13 a.m. UTC | #2
在 2023/6/21 4:21, Bob Pearson 写道:
> On 4/28/23 04:39, Zhu Yanjun wrote:
>> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>>
>> When running "rdma link del" command, dellink function will be called.
>> If the sock refcnt is greater than the refcnt needed for udp tunnel,
>> the sock refcnt will be decreased by 1.
>>
>> If equal, the last rdma link is removed. The udp tunnel will be
>> destroyed.
>>
>> Tested-by: Rain River <rain.1986.08.12@gmail.com>
>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>> ---
>>   drivers/infiniband/sw/rxe/rxe.c     | 12 +++++++++++-
>>   drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
>>   drivers/infiniband/sw/rxe/rxe_net.h |  1 +
>>   3 files changed, 27 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>> index 0ce6adb43cfc..ebfabc6d6b76 100644
>> --- a/drivers/infiniband/sw/rxe/rxe.c
>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>>   /* called by ifc layer to create new rxe device.
>>    * The caller should allocate memory for rxe by calling ib_alloc_device.
>>    */
>> +static struct rdma_link_ops rxe_link_ops;
>>   int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>>   {
>>   	rxe_init(rxe);
>>   	rxe_set_mtu(rxe, mtu);
>> +	rxe->ib_dev.link_ops = &rxe_link_ops;
>>   
>>   	return rxe_register_device(rxe, ibdev_name);
>>   }
>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>>   	return err;
>>   }
>>   
>> -struct rdma_link_ops rxe_link_ops = {
>> +static int rxe_dellink(struct ib_device *dev)
>> +{
>> +	rxe_net_del(dev);
>> +
>> +	return 0;
>> +}
>> +
>> +static struct rdma_link_ops rxe_link_ops = {
>>   	.type = "rxe",
>>   	.newlink = rxe_newlink,
>> +	.dellink = rxe_dellink,
>>   };
>>   
>>   static int __init rxe_module_init(void)
>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>> index 3ca92e062800..4cc7de7b115b 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>>   	return 0;
>>   }
>>   
>> +#define SK_REF_FOR_TUNNEL	2
>> +void rxe_net_del(struct ib_device *dev)
>> +{
>> +	if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>> +		__sock_put(recv_sockets.sk6->sk);
>> +	else
>> +		rxe_release_udp_tunnel(recv_sockets.sk6);
>> +
>> +	if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>> +		__sock_put(recv_sockets.sk4->sk);
>> +	else
>> +		rxe_release_udp_tunnel(recv_sockets.sk4);
>> +}
>> +#undef SK_REF_FOR_TUNNEL
>> +
>>   static void rxe_port_event(struct rxe_dev *rxe,
>>   			   enum ib_event_type event)
>>   {
>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>>   
>>   void rxe_net_exit(void)
>>   {
>> -	rxe_release_udp_tunnel(recv_sockets.sk6);
>> -	rxe_release_udp_tunnel(recv_sockets.sk4);
>>   	unregister_netdevice_notifier(&rxe_net_notifier);
>>   }
> These calls are moved to rxe_net_del which is called by an explicit unlink command.
> But if rxe_net_init fails and returns an error code this will never happen.
> This will result in leaking resources.

Thanks a lot. Bob.

Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 
should be released.

I will fix it in the latest version.

Zhu Yanjun

>
> Bob
>>   
>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
>> index a222c3eeae12..f48f22f3353b 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_net.h
>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
>>   };
>>   
>>   int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>> +void rxe_net_del(struct ib_device *dev);
>>   
>>   int rxe_register_notifier(void);
>>   int rxe_net_init(void);
Bob Pearson June 21, 2023, 3:23 a.m. UTC | #3
On 6/20/23 21:13, Zhu Yanjun wrote:
> 
> 在 2023/6/21 4:21, Bob Pearson 写道:
>> On 4/28/23 04:39, Zhu Yanjun wrote:
>>> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>
>>> When running "rdma link del" command, dellink function will be called.
>>> If the sock refcnt is greater than the refcnt needed for udp tunnel,
>>> the sock refcnt will be decreased by 1.
>>>
>>> If equal, the last rdma link is removed. The udp tunnel will be
>>> destroyed.
>>>
>>> Tested-by: Rain River <rain.1986.08.12@gmail.com>
>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>>> ---
>>>   drivers/infiniband/sw/rxe/rxe.c     | 12 +++++++++++-
>>>   drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
>>>   drivers/infiniband/sw/rxe/rxe_net.h |  1 +
>>>   3 files changed, 27 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>>> index 0ce6adb43cfc..ebfabc6d6b76 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>>>   /* called by ifc layer to create new rxe device.
>>>    * The caller should allocate memory for rxe by calling ib_alloc_device.
>>>    */
>>> +static struct rdma_link_ops rxe_link_ops;
>>>   int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>>>   {
>>>       rxe_init(rxe);
>>>       rxe_set_mtu(rxe, mtu);
>>> +    rxe->ib_dev.link_ops = &rxe_link_ops;
>>>         return rxe_register_device(rxe, ibdev_name);
>>>   }
>>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>>>       return err;
>>>   }
>>>   -struct rdma_link_ops rxe_link_ops = {
>>> +static int rxe_dellink(struct ib_device *dev)
>>> +{
>>> +    rxe_net_del(dev);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static struct rdma_link_ops rxe_link_ops = {
>>>       .type = "rxe",
>>>       .newlink = rxe_newlink,
>>> +    .dellink = rxe_dellink,
>>>   };
>>>     static int __init rxe_module_init(void)
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>>> index 3ca92e062800..4cc7de7b115b 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>>>       return 0;
>>>   }
>>>   +#define SK_REF_FOR_TUNNEL    2
>>> +void rxe_net_del(struct ib_device *dev)
>>> +{
>>> +    if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>> +        __sock_put(recv_sockets.sk6->sk);
>>> +    else
>>> +        rxe_release_udp_tunnel(recv_sockets.sk6);
>>> +
>>> +    if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>> +        __sock_put(recv_sockets.sk4->sk);
>>> +    else
>>> +        rxe_release_udp_tunnel(recv_sockets.sk4);
>>> +}
>>> +#undef SK_REF_FOR_TUNNEL
>>> +
>>>   static void rxe_port_event(struct rxe_dev *rxe,
>>>                  enum ib_event_type event)
>>>   {
>>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>>>     void rxe_net_exit(void)
>>>   {
>>> -    rxe_release_udp_tunnel(recv_sockets.sk6);
>>> -    rxe_release_udp_tunnel(recv_sockets.sk4);
>>>       unregister_netdevice_notifier(&rxe_net_notifier);
>>>   }
>> These calls are moved to rxe_net_del which is called by an explicit unlink command.
>> But if rxe_net_init fails and returns an error code this will never happen.
>> This will result in leaking resources.
> 
> Thanks a lot. Bob.
> 
> Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released.
> 
> I will fix it in the latest version.
> 
> Zhu Yanjun

I haven't had a chance to test netns yet. I am sure it works but I will test it.
The only other thing I noticed are some stylistic differences with the rest of
the rxe driver. You use

struct rxe_dev *rdev;

elsewhere it is

struct rxe_dev *rxe;

Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device.
rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver
specific ones. It's less typing that way.

With a couple of exceptions all the printk's are now in the form

rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to.

where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically
adapted from the siw driver.

Regards,

Bob

> 
>>
>> Bob
>>>   diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
>>> index a222c3eeae12..f48f22f3353b 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h
>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
>>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
>>>   };
>>>     int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>>> +void rxe_net_del(struct ib_device *dev);
>>>     int rxe_register_notifier(void);
>>>   int rxe_net_init(void);
Zhu Yanjun June 21, 2023, 6:17 a.m. UTC | #4
在 2023/6/21 11:23, Bob Pearson 写道:
> On 6/20/23 21:13, Zhu Yanjun wrote:
>> 在 2023/6/21 4:21, Bob Pearson 写道:
>>> On 4/28/23 04:39, Zhu Yanjun wrote:
>>>> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>>
>>>> When running "rdma link del" command, dellink function will be called.
>>>> If the sock refcnt is greater than the refcnt needed for udp tunnel,
>>>> the sock refcnt will be decreased by 1.
>>>>
>>>> If equal, the last rdma link is removed. The udp tunnel will be
>>>> destroyed.
>>>>
>>>> Tested-by: Rain River <rain.1986.08.12@gmail.com>
>>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>> ---
>>>>    drivers/infiniband/sw/rxe/rxe.c     | 12 +++++++++++-
>>>>    drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
>>>>    drivers/infiniband/sw/rxe/rxe_net.h |  1 +
>>>>    3 files changed, 27 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>>>> index 0ce6adb43cfc..ebfabc6d6b76 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>>>>    /* called by ifc layer to create new rxe device.
>>>>     * The caller should allocate memory for rxe by calling ib_alloc_device.
>>>>     */
>>>> +static struct rdma_link_ops rxe_link_ops;
>>>>    int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>>>>    {
>>>>        rxe_init(rxe);
>>>>        rxe_set_mtu(rxe, mtu);
>>>> +    rxe->ib_dev.link_ops = &rxe_link_ops;
>>>>          return rxe_register_device(rxe, ibdev_name);
>>>>    }
>>>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>>>>        return err;
>>>>    }
>>>>    -struct rdma_link_ops rxe_link_ops = {
>>>> +static int rxe_dellink(struct ib_device *dev)
>>>> +{
>>>> +    rxe_net_del(dev);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static struct rdma_link_ops rxe_link_ops = {
>>>>        .type = "rxe",
>>>>        .newlink = rxe_newlink,
>>>> +    .dellink = rxe_dellink,
>>>>    };
>>>>      static int __init rxe_module_init(void)
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>>>> index 3ca92e062800..4cc7de7b115b 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>>>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>>>>        return 0;
>>>>    }
>>>>    +#define SK_REF_FOR_TUNNEL    2
>>>> +void rxe_net_del(struct ib_device *dev)
>>>> +{
>>>> +    if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>> +        __sock_put(recv_sockets.sk6->sk);
>>>> +    else
>>>> +        rxe_release_udp_tunnel(recv_sockets.sk6);
>>>> +
>>>> +    if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>> +        __sock_put(recv_sockets.sk4->sk);
>>>> +    else
>>>> +        rxe_release_udp_tunnel(recv_sockets.sk4);
>>>> +}
>>>> +#undef SK_REF_FOR_TUNNEL
>>>> +
>>>>    static void rxe_port_event(struct rxe_dev *rxe,
>>>>                   enum ib_event_type event)
>>>>    {
>>>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>>>>      void rxe_net_exit(void)
>>>>    {
>>>> -    rxe_release_udp_tunnel(recv_sockets.sk6);
>>>> -    rxe_release_udp_tunnel(recv_sockets.sk4);
>>>>        unregister_netdevice_notifier(&rxe_net_notifier);
>>>>    }
>>> These calls are moved to rxe_net_del which is called by an explicit unlink command.
>>> But if rxe_net_init fails and returns an error code this will never happen.
>>> This will result in leaking resources.
>> Thanks a lot. Bob.
>>
>> Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released.
>>
>> I will fix it in the latest version.
>>
>> Zhu Yanjun
> I haven't had a chance to test netns yet. I am sure it works but I will test it.
Yes. Please. It is an interesting feature.
> The only other thing I noticed are some stylistic differences with the rest of
> the rxe driver. You use
>
> struct rxe_dev *rdev;
>
> elsewhere it is
>
> struct rxe_dev *rxe;
>
> Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device.
> rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver
> specific ones. It's less typing that way.


Got you. I think we should use rxe instead of rdev. I will fix it in the 
latest commits.


>
> With a couple of exceptions all the printk's are now in the form
>
> rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to.
>
> where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically
> adapted from the siw driver.

If I can get you correctly, you mean that we should use rxe_dbg_qp, .... 
to replace pr_err ....

I have questions:

1). What benefit will this bring?

2). If the log is in module __init or  module __exit functions, we 
should use pr_xxx? Because obj does not exist in these __init and __exit 
functions.

Best Regards,

Zhu Yanjun

>
> Regards,
>
> Bob
>
>>> Bob
>>>>    diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
>>>> index a222c3eeae12..f48f22f3353b 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
>>>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
>>>>    };
>>>>      int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>>>> +void rxe_net_del(struct ib_device *dev);
>>>>      int rxe_register_notifier(void);
>>>>    int rxe_net_init(void);
Bob Pearson June 21, 2023, 4:24 p.m. UTC | #5
On 6/21/23 01:17, Zhu Yanjun wrote:
> 
> 在 2023/6/21 11:23, Bob Pearson 写道:
>> On 6/20/23 21:13, Zhu Yanjun wrote:
>>> 在 2023/6/21 4:21, Bob Pearson 写道:
>>>> On 4/28/23 04:39, Zhu Yanjun wrote:
>>>>> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>>>
>>>>> When running "rdma link del" command, dellink function will be called.
>>>>> If the sock refcnt is greater than the refcnt needed for udp tunnel,
>>>>> the sock refcnt will be decreased by 1.
>>>>>
>>>>> If equal, the last rdma link is removed. The udp tunnel will be
>>>>> destroyed.
>>>>>
>>>>> Tested-by: Rain River <rain.1986.08.12@gmail.com>
>>>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>>> ---
>>>>>    drivers/infiniband/sw/rxe/rxe.c     | 12 +++++++++++-
>>>>>    drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
>>>>>    drivers/infiniband/sw/rxe/rxe_net.h |  1 +
>>>>>    3 files changed, 27 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>>>>> index 0ce6adb43cfc..ebfabc6d6b76 100644
>>>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>>>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>>>>>    /* called by ifc layer to create new rxe device.
>>>>>     * The caller should allocate memory for rxe by calling ib_alloc_device.
>>>>>     */
>>>>> +static struct rdma_link_ops rxe_link_ops;
>>>>>    int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>>>>>    {
>>>>>        rxe_init(rxe);
>>>>>        rxe_set_mtu(rxe, mtu);
>>>>> +    rxe->ib_dev.link_ops = &rxe_link_ops;
>>>>>          return rxe_register_device(rxe, ibdev_name);
>>>>>    }
>>>>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>>>>>        return err;
>>>>>    }
>>>>>    -struct rdma_link_ops rxe_link_ops = {
>>>>> +static int rxe_dellink(struct ib_device *dev)
>>>>> +{
>>>>> +    rxe_net_del(dev);
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static struct rdma_link_ops rxe_link_ops = {
>>>>>        .type = "rxe",
>>>>>        .newlink = rxe_newlink,
>>>>> +    .dellink = rxe_dellink,
>>>>>    };
>>>>>      static int __init rxe_module_init(void)
>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>>>>> index 3ca92e062800..4cc7de7b115b 100644
>>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>>>>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>>>>>        return 0;
>>>>>    }
>>>>>    +#define SK_REF_FOR_TUNNEL    2
>>>>> +void rxe_net_del(struct ib_device *dev)
>>>>> +{
>>>>> +    if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>>> +        __sock_put(recv_sockets.sk6->sk);
>>>>> +    else
>>>>> +        rxe_release_udp_tunnel(recv_sockets.sk6);
>>>>> +
>>>>> +    if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>>> +        __sock_put(recv_sockets.sk4->sk);
>>>>> +    else
>>>>> +        rxe_release_udp_tunnel(recv_sockets.sk4);
>>>>> +}
>>>>> +#undef SK_REF_FOR_TUNNEL
>>>>> +
>>>>>    static void rxe_port_event(struct rxe_dev *rxe,
>>>>>                   enum ib_event_type event)
>>>>>    {
>>>>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>>>>>      void rxe_net_exit(void)
>>>>>    {
>>>>> -    rxe_release_udp_tunnel(recv_sockets.sk6);
>>>>> -    rxe_release_udp_tunnel(recv_sockets.sk4);
>>>>>        unregister_netdevice_notifier(&rxe_net_notifier);
>>>>>    }
>>>> These calls are moved to rxe_net_del which is called by an explicit unlink command.
>>>> But if rxe_net_init fails and returns an error code this will never happen.
>>>> This will result in leaking resources.
>>> Thanks a lot. Bob.
>>>
>>> Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released.
>>>
>>> I will fix it in the latest version.
>>>
>>> Zhu Yanjun
>> I haven't had a chance to test netns yet. I am sure it works but I will test it.
> Yes. Please. It is an interesting feature.
>> The only other thing I noticed are some stylistic differences with the rest of
>> the rxe driver. You use
>>
>> struct rxe_dev *rdev;
>>
>> elsewhere it is
>>
>> struct rxe_dev *rxe;
>>
>> Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device.
>> rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver
>> specific ones. It's less typing that way.
> 
> 
> Got you. I think we should use rxe instead of rdev. I will fix it in the latest commits.
> 
> 
>>
>> With a couple of exceptions all the printk's are now in the form
>>
>> rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to.
>>
>> where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically
>> adapted from the siw driver.
> 
> If I can get you correctly, you mean that we should use rxe_dbg_qp, .... to replace pr_err ....
> 
> I have questions:
> 
> 1). What benefit will this bring?
> 
> 2). If the log is in module __init or  module __exit functions, we should use pr_xxx? Because obj does not exist in these __init and __exit functions.
I think that is the way the driver is now. Go ahead.
> 
> Best Regards,
> 
> Zhu Yanjun
> 
>>
>> Regards,
>>
>> Bob
>>
>>>> Bob
>>>>>    diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
>>>>> index a222c3eeae12..f48f22f3353b 100644
>>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h
>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
>>>>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
>>>>>    };
>>>>>      int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>>>>> +void rxe_net_del(struct ib_device *dev);
>>>>>      int rxe_register_notifier(void);
>>>>>    int rxe_net_init(void);
Zhu Yanjun June 23, 2023, 7:19 a.m. UTC | #6
在 2023/6/22 0:24, Bob Pearson 写道:
> On 6/21/23 01:17, Zhu Yanjun wrote:
>>
>> 在 2023/6/21 11:23, Bob Pearson 写道:
>>> On 6/20/23 21:13, Zhu Yanjun wrote:
>>>> 在 2023/6/21 4:21, Bob Pearson 写道:
>>>>> On 4/28/23 04:39, Zhu Yanjun wrote:
>>>>>> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>>>>
>>>>>> When running "rdma link del" command, dellink function will be called.
>>>>>> If the sock refcnt is greater than the refcnt needed for udp tunnel,
>>>>>> the sock refcnt will be decreased by 1.
>>>>>>
>>>>>> If equal, the last rdma link is removed. The udp tunnel will be
>>>>>> destroyed.
>>>>>>
>>>>>> Tested-by: Rain River <rain.1986.08.12@gmail.com>
>>>>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>>>> ---
>>>>>>     drivers/infiniband/sw/rxe/rxe.c     | 12 +++++++++++-
>>>>>>     drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
>>>>>>     drivers/infiniband/sw/rxe/rxe_net.h |  1 +
>>>>>>     3 files changed, 27 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>>>>>> index 0ce6adb43cfc..ebfabc6d6b76 100644
>>>>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>>>>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>>>>>>     /* called by ifc layer to create new rxe device.
>>>>>>      * The caller should allocate memory for rxe by calling ib_alloc_device.
>>>>>>      */
>>>>>> +static struct rdma_link_ops rxe_link_ops;
>>>>>>     int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>>>>>>     {
>>>>>>         rxe_init(rxe);
>>>>>>         rxe_set_mtu(rxe, mtu);
>>>>>> +    rxe->ib_dev.link_ops = &rxe_link_ops;
>>>>>>           return rxe_register_device(rxe, ibdev_name);
>>>>>>     }
>>>>>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>>>>>>         return err;
>>>>>>     }
>>>>>>     -struct rdma_link_ops rxe_link_ops = {
>>>>>> +static int rxe_dellink(struct ib_device *dev)
>>>>>> +{
>>>>>> +    rxe_net_del(dev);
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static struct rdma_link_ops rxe_link_ops = {
>>>>>>         .type = "rxe",
>>>>>>         .newlink = rxe_newlink,
>>>>>> +    .dellink = rxe_dellink,
>>>>>>     };
>>>>>>       static int __init rxe_module_init(void)
>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>>>>>> index 3ca92e062800..4cc7de7b115b 100644
>>>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>>>>>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>>>>>>         return 0;
>>>>>>     }
>>>>>>     +#define SK_REF_FOR_TUNNEL    2
>>>>>> +void rxe_net_del(struct ib_device *dev)
>>>>>> +{
>>>>>> +    if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>>>> +        __sock_put(recv_sockets.sk6->sk);
>>>>>> +    else
>>>>>> +        rxe_release_udp_tunnel(recv_sockets.sk6);
>>>>>> +
>>>>>> +    if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>>>> +        __sock_put(recv_sockets.sk4->sk);
>>>>>> +    else
>>>>>> +        rxe_release_udp_tunnel(recv_sockets.sk4);
>>>>>> +}
>>>>>> +#undef SK_REF_FOR_TUNNEL
>>>>>> +
>>>>>>     static void rxe_port_event(struct rxe_dev *rxe,
>>>>>>                    enum ib_event_type event)
>>>>>>     {
>>>>>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>>>>>>       void rxe_net_exit(void)
>>>>>>     {
>>>>>> -    rxe_release_udp_tunnel(recv_sockets.sk6);
>>>>>> -    rxe_release_udp_tunnel(recv_sockets.sk4);
>>>>>>         unregister_netdevice_notifier(&rxe_net_notifier);
>>>>>>     }
>>>>> These calls are moved to rxe_net_del which is called by an explicit unlink command.
>>>>> But if rxe_net_init fails and returns an error code this will never happen.
>>>>> This will result in leaking resources.
>>>> Thanks a lot. Bob.
>>>>
>>>> Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released.
>>>>
>>>> I will fix it in the latest version.
>>>>
>>>> Zhu Yanjun
>>> I haven't had a chance to test netns yet. I am sure it works but I will test it.
>> Yes. Please. It is an interesting feature.
>>> The only other thing I noticed are some stylistic differences with the rest of
>>> the rxe driver. You use
>>>
>>> struct rxe_dev *rdev;
>>>
>>> elsewhere it is
>>>
>>> struct rxe_dev *rxe;
>>>
>>> Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device.
>>> rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver
>>> specific ones. It's less typing that way.
>>
>>
>> Got you. I think we should use rxe instead of rdev. I will fix it in the latest commits.
>>
>>
>>>
>>> With a couple of exceptions all the printk's are now in the form
>>>
>>> rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to.
>>>
>>> where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically
>>> adapted from the siw driver.
>>
>> If I can get you correctly, you mean that we should use rxe_dbg_qp, .... to replace pr_err ....
>>
>> I have questions:
>>
>> 1). What benefit will this bring?
>>
>> 2). If the log is in module __init or  module __exit functions, we should use pr_xxx? Because obj does not exist in these __init and __exit functions.
> I think that is the way the driver is now. Go ahead.

OK. The latest commits will be sent out very soon.

Zhu Yanjun

>>
>> Best Regards,
>>
>> Zhu Yanjun
>>
>>>
>>> Regards,
>>>
>>> Bob
>>>
>>>>> Bob
>>>>>>     diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
>>>>>> index a222c3eeae12..f48f22f3353b 100644
>>>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h
>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
>>>>>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
>>>>>>     };
>>>>>>       int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>>>>>> +void rxe_net_del(struct ib_device *dev);
>>>>>>       int rxe_register_notifier(void);
>>>>>>     int rxe_net_init(void);
>
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 0ce6adb43cfc..ebfabc6d6b76 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -166,10 +166,12 @@  void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
 /* called by ifc layer to create new rxe device.
  * The caller should allocate memory for rxe by calling ib_alloc_device.
  */
+static struct rdma_link_ops rxe_link_ops;
 int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
 {
 	rxe_init(rxe);
 	rxe_set_mtu(rxe, mtu);
+	rxe->ib_dev.link_ops = &rxe_link_ops;
 
 	return rxe_register_device(rxe, ibdev_name);
 }
@@ -206,9 +208,17 @@  static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
 	return err;
 }
 
-struct rdma_link_ops rxe_link_ops = {
+static int rxe_dellink(struct ib_device *dev)
+{
+	rxe_net_del(dev);
+
+	return 0;
+}
+
+static struct rdma_link_ops rxe_link_ops = {
 	.type = "rxe",
 	.newlink = rxe_newlink,
+	.dellink = rxe_dellink,
 };
 
 static int __init rxe_module_init(void)
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 3ca92e062800..4cc7de7b115b 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -530,6 +530,21 @@  int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
 	return 0;
 }
 
+#define SK_REF_FOR_TUNNEL	2
+void rxe_net_del(struct ib_device *dev)
+{
+	if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
+		__sock_put(recv_sockets.sk6->sk);
+	else
+		rxe_release_udp_tunnel(recv_sockets.sk6);
+
+	if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
+		__sock_put(recv_sockets.sk4->sk);
+	else
+		rxe_release_udp_tunnel(recv_sockets.sk4);
+}
+#undef SK_REF_FOR_TUNNEL
+
 static void rxe_port_event(struct rxe_dev *rxe,
 			   enum ib_event_type event)
 {
@@ -689,8 +704,6 @@  int rxe_register_notifier(void)
 
 void rxe_net_exit(void)
 {
-	rxe_release_udp_tunnel(recv_sockets.sk6);
-	rxe_release_udp_tunnel(recv_sockets.sk4);
 	unregister_netdevice_notifier(&rxe_net_notifier);
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
index a222c3eeae12..f48f22f3353b 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.h
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -17,6 +17,7 @@  struct rxe_recv_sockets {
 };
 
 int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
+void rxe_net_del(struct ib_device *dev);
 
 int rxe_register_notifier(void);
 int rxe_net_init(void);