Message ID | 20230428093914.2121131-5-yanjun.zhu@intel.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | Fix the problem that rxe can not work in net namespace | expand |
On 4/28/23 04:39, Zhu Yanjun wrote: > From: Zhu Yanjun <yanjun.zhu@linux.dev> > > When running "rdma link del" command, dellink function will be called. > If the sock refcnt is greater than the refcnt needed for udp tunnel, > the sock refcnt will be decreased by 1. > > If equal, the last rdma link is removed. The udp tunnel will be > destroyed. > > Tested-by: Rain River <rain.1986.08.12@gmail.com> > Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev> > --- > drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++- > drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++-- > drivers/infiniband/sw/rxe/rxe_net.h | 1 + > 3 files changed, 27 insertions(+), 3 deletions(-) > > diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c > index 0ce6adb43cfc..ebfabc6d6b76 100644 > --- a/drivers/infiniband/sw/rxe/rxe.c > +++ b/drivers/infiniband/sw/rxe/rxe.c > @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) > /* called by ifc layer to create new rxe device. > * The caller should allocate memory for rxe by calling ib_alloc_device. > */ > +static struct rdma_link_ops rxe_link_ops; > int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) > { > rxe_init(rxe); > rxe_set_mtu(rxe, mtu); > + rxe->ib_dev.link_ops = &rxe_link_ops; > > return rxe_register_device(rxe, ibdev_name); > } > @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) > return err; > } > > -struct rdma_link_ops rxe_link_ops = { > +static int rxe_dellink(struct ib_device *dev) > +{ > + rxe_net_del(dev); > + > + return 0; > +} > + > +static struct rdma_link_ops rxe_link_ops = { > .type = "rxe", > .newlink = rxe_newlink, > + .dellink = rxe_dellink, > }; > > static int __init rxe_module_init(void) > diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c > index 3ca92e062800..4cc7de7b115b 100644 > --- a/drivers/infiniband/sw/rxe/rxe_net.c > +++ b/drivers/infiniband/sw/rxe/rxe_net.c > @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) > return 0; > } > > +#define SK_REF_FOR_TUNNEL 2 > +void rxe_net_del(struct ib_device *dev) > +{ > + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) > + __sock_put(recv_sockets.sk6->sk); > + else > + rxe_release_udp_tunnel(recv_sockets.sk6); > + > + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) > + __sock_put(recv_sockets.sk4->sk); > + else > + rxe_release_udp_tunnel(recv_sockets.sk4); > +} > +#undef SK_REF_FOR_TUNNEL > + > static void rxe_port_event(struct rxe_dev *rxe, > enum ib_event_type event) > { > @@ -689,8 +704,6 @@ int rxe_register_notifier(void) > > void rxe_net_exit(void) > { > - rxe_release_udp_tunnel(recv_sockets.sk6); > - rxe_release_udp_tunnel(recv_sockets.sk4); > unregister_netdevice_notifier(&rxe_net_notifier); > } These calls are moved to rxe_net_del which is called by an explicit unlink command. But if rxe_net_init fails and returns an error code this will never happen. This will result in leaking resources. Bob > > diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h > index a222c3eeae12..f48f22f3353b 100644 > --- a/drivers/infiniband/sw/rxe/rxe_net.h > +++ b/drivers/infiniband/sw/rxe/rxe_net.h > @@ -17,6 +17,7 @@ struct rxe_recv_sockets { > }; > > int rxe_net_add(const char *ibdev_name, struct net_device *ndev); > +void rxe_net_del(struct ib_device *dev); > > int rxe_register_notifier(void); > int rxe_net_init(void);
在 2023/6/21 4:21, Bob Pearson 写道: > On 4/28/23 04:39, Zhu Yanjun wrote: >> From: Zhu Yanjun <yanjun.zhu@linux.dev> >> >> When running "rdma link del" command, dellink function will be called. >> If the sock refcnt is greater than the refcnt needed for udp tunnel, >> the sock refcnt will be decreased by 1. >> >> If equal, the last rdma link is removed. The udp tunnel will be >> destroyed. >> >> Tested-by: Rain River <rain.1986.08.12@gmail.com> >> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev> >> --- >> drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++- >> drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++-- >> drivers/infiniband/sw/rxe/rxe_net.h | 1 + >> 3 files changed, 27 insertions(+), 3 deletions(-) >> >> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c >> index 0ce6adb43cfc..ebfabc6d6b76 100644 >> --- a/drivers/infiniband/sw/rxe/rxe.c >> +++ b/drivers/infiniband/sw/rxe/rxe.c >> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) >> /* called by ifc layer to create new rxe device. >> * The caller should allocate memory for rxe by calling ib_alloc_device. >> */ >> +static struct rdma_link_ops rxe_link_ops; >> int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) >> { >> rxe_init(rxe); >> rxe_set_mtu(rxe, mtu); >> + rxe->ib_dev.link_ops = &rxe_link_ops; >> >> return rxe_register_device(rxe, ibdev_name); >> } >> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) >> return err; >> } >> >> -struct rdma_link_ops rxe_link_ops = { >> +static int rxe_dellink(struct ib_device *dev) >> +{ >> + rxe_net_del(dev); >> + >> + return 0; >> +} >> + >> +static struct rdma_link_ops rxe_link_ops = { >> .type = "rxe", >> .newlink = rxe_newlink, >> + .dellink = rxe_dellink, >> }; >> >> static int __init rxe_module_init(void) >> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c >> index 3ca92e062800..4cc7de7b115b 100644 >> --- a/drivers/infiniband/sw/rxe/rxe_net.c >> +++ b/drivers/infiniband/sw/rxe/rxe_net.c >> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) >> return 0; >> } >> >> +#define SK_REF_FOR_TUNNEL 2 >> +void rxe_net_del(struct ib_device *dev) >> +{ >> + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) >> + __sock_put(recv_sockets.sk6->sk); >> + else >> + rxe_release_udp_tunnel(recv_sockets.sk6); >> + >> + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) >> + __sock_put(recv_sockets.sk4->sk); >> + else >> + rxe_release_udp_tunnel(recv_sockets.sk4); >> +} >> +#undef SK_REF_FOR_TUNNEL >> + >> static void rxe_port_event(struct rxe_dev *rxe, >> enum ib_event_type event) >> { >> @@ -689,8 +704,6 @@ int rxe_register_notifier(void) >> >> void rxe_net_exit(void) >> { >> - rxe_release_udp_tunnel(recv_sockets.sk6); >> - rxe_release_udp_tunnel(recv_sockets.sk4); >> unregister_netdevice_notifier(&rxe_net_notifier); >> } > These calls are moved to rxe_net_del which is called by an explicit unlink command. > But if rxe_net_init fails and returns an error code this will never happen. > This will result in leaking resources. Thanks a lot. Bob. Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released. I will fix it in the latest version. Zhu Yanjun > > Bob >> >> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h >> index a222c3eeae12..f48f22f3353b 100644 >> --- a/drivers/infiniband/sw/rxe/rxe_net.h >> +++ b/drivers/infiniband/sw/rxe/rxe_net.h >> @@ -17,6 +17,7 @@ struct rxe_recv_sockets { >> }; >> >> int rxe_net_add(const char *ibdev_name, struct net_device *ndev); >> +void rxe_net_del(struct ib_device *dev); >> >> int rxe_register_notifier(void); >> int rxe_net_init(void);
On 6/20/23 21:13, Zhu Yanjun wrote: > > 在 2023/6/21 4:21, Bob Pearson 写道: >> On 4/28/23 04:39, Zhu Yanjun wrote: >>> From: Zhu Yanjun <yanjun.zhu@linux.dev> >>> >>> When running "rdma link del" command, dellink function will be called. >>> If the sock refcnt is greater than the refcnt needed for udp tunnel, >>> the sock refcnt will be decreased by 1. >>> >>> If equal, the last rdma link is removed. The udp tunnel will be >>> destroyed. >>> >>> Tested-by: Rain River <rain.1986.08.12@gmail.com> >>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev> >>> --- >>> drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++- >>> drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++-- >>> drivers/infiniband/sw/rxe/rxe_net.h | 1 + >>> 3 files changed, 27 insertions(+), 3 deletions(-) >>> >>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c >>> index 0ce6adb43cfc..ebfabc6d6b76 100644 >>> --- a/drivers/infiniband/sw/rxe/rxe.c >>> +++ b/drivers/infiniband/sw/rxe/rxe.c >>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) >>> /* called by ifc layer to create new rxe device. >>> * The caller should allocate memory for rxe by calling ib_alloc_device. >>> */ >>> +static struct rdma_link_ops rxe_link_ops; >>> int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) >>> { >>> rxe_init(rxe); >>> rxe_set_mtu(rxe, mtu); >>> + rxe->ib_dev.link_ops = &rxe_link_ops; >>> return rxe_register_device(rxe, ibdev_name); >>> } >>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) >>> return err; >>> } >>> -struct rdma_link_ops rxe_link_ops = { >>> +static int rxe_dellink(struct ib_device *dev) >>> +{ >>> + rxe_net_del(dev); >>> + >>> + return 0; >>> +} >>> + >>> +static struct rdma_link_ops rxe_link_ops = { >>> .type = "rxe", >>> .newlink = rxe_newlink, >>> + .dellink = rxe_dellink, >>> }; >>> static int __init rxe_module_init(void) >>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c >>> index 3ca92e062800..4cc7de7b115b 100644 >>> --- a/drivers/infiniband/sw/rxe/rxe_net.c >>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c >>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) >>> return 0; >>> } >>> +#define SK_REF_FOR_TUNNEL 2 >>> +void rxe_net_del(struct ib_device *dev) >>> +{ >>> + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) >>> + __sock_put(recv_sockets.sk6->sk); >>> + else >>> + rxe_release_udp_tunnel(recv_sockets.sk6); >>> + >>> + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) >>> + __sock_put(recv_sockets.sk4->sk); >>> + else >>> + rxe_release_udp_tunnel(recv_sockets.sk4); >>> +} >>> +#undef SK_REF_FOR_TUNNEL >>> + >>> static void rxe_port_event(struct rxe_dev *rxe, >>> enum ib_event_type event) >>> { >>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void) >>> void rxe_net_exit(void) >>> { >>> - rxe_release_udp_tunnel(recv_sockets.sk6); >>> - rxe_release_udp_tunnel(recv_sockets.sk4); >>> unregister_netdevice_notifier(&rxe_net_notifier); >>> } >> These calls are moved to rxe_net_del which is called by an explicit unlink command. >> But if rxe_net_init fails and returns an error code this will never happen. >> This will result in leaking resources. > > Thanks a lot. Bob. > > Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released. > > I will fix it in the latest version. > > Zhu Yanjun I haven't had a chance to test netns yet. I am sure it works but I will test it. The only other thing I noticed are some stylistic differences with the rest of the rxe driver. You use struct rxe_dev *rdev; elsewhere it is struct rxe_dev *rxe; Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device. rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver specific ones. It's less typing that way. With a couple of exceptions all the printk's are now in the form rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to. where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically adapted from the siw driver. Regards, Bob > >> >> Bob >>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h >>> index a222c3eeae12..f48f22f3353b 100644 >>> --- a/drivers/infiniband/sw/rxe/rxe_net.h >>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h >>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets { >>> }; >>> int rxe_net_add(const char *ibdev_name, struct net_device *ndev); >>> +void rxe_net_del(struct ib_device *dev); >>> int rxe_register_notifier(void); >>> int rxe_net_init(void);
在 2023/6/21 11:23, Bob Pearson 写道: > On 6/20/23 21:13, Zhu Yanjun wrote: >> 在 2023/6/21 4:21, Bob Pearson 写道: >>> On 4/28/23 04:39, Zhu Yanjun wrote: >>>> From: Zhu Yanjun <yanjun.zhu@linux.dev> >>>> >>>> When running "rdma link del" command, dellink function will be called. >>>> If the sock refcnt is greater than the refcnt needed for udp tunnel, >>>> the sock refcnt will be decreased by 1. >>>> >>>> If equal, the last rdma link is removed. The udp tunnel will be >>>> destroyed. >>>> >>>> Tested-by: Rain River <rain.1986.08.12@gmail.com> >>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev> >>>> --- >>>> drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++- >>>> drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++-- >>>> drivers/infiniband/sw/rxe/rxe_net.h | 1 + >>>> 3 files changed, 27 insertions(+), 3 deletions(-) >>>> >>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c >>>> index 0ce6adb43cfc..ebfabc6d6b76 100644 >>>> --- a/drivers/infiniband/sw/rxe/rxe.c >>>> +++ b/drivers/infiniband/sw/rxe/rxe.c >>>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) >>>> /* called by ifc layer to create new rxe device. >>>> * The caller should allocate memory for rxe by calling ib_alloc_device. >>>> */ >>>> +static struct rdma_link_ops rxe_link_ops; >>>> int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) >>>> { >>>> rxe_init(rxe); >>>> rxe_set_mtu(rxe, mtu); >>>> + rxe->ib_dev.link_ops = &rxe_link_ops; >>>> return rxe_register_device(rxe, ibdev_name); >>>> } >>>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) >>>> return err; >>>> } >>>> -struct rdma_link_ops rxe_link_ops = { >>>> +static int rxe_dellink(struct ib_device *dev) >>>> +{ >>>> + rxe_net_del(dev); >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static struct rdma_link_ops rxe_link_ops = { >>>> .type = "rxe", >>>> .newlink = rxe_newlink, >>>> + .dellink = rxe_dellink, >>>> }; >>>> static int __init rxe_module_init(void) >>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c >>>> index 3ca92e062800..4cc7de7b115b 100644 >>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c >>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c >>>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) >>>> return 0; >>>> } >>>> +#define SK_REF_FOR_TUNNEL 2 >>>> +void rxe_net_del(struct ib_device *dev) >>>> +{ >>>> + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) >>>> + __sock_put(recv_sockets.sk6->sk); >>>> + else >>>> + rxe_release_udp_tunnel(recv_sockets.sk6); >>>> + >>>> + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) >>>> + __sock_put(recv_sockets.sk4->sk); >>>> + else >>>> + rxe_release_udp_tunnel(recv_sockets.sk4); >>>> +} >>>> +#undef SK_REF_FOR_TUNNEL >>>> + >>>> static void rxe_port_event(struct rxe_dev *rxe, >>>> enum ib_event_type event) >>>> { >>>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void) >>>> void rxe_net_exit(void) >>>> { >>>> - rxe_release_udp_tunnel(recv_sockets.sk6); >>>> - rxe_release_udp_tunnel(recv_sockets.sk4); >>>> unregister_netdevice_notifier(&rxe_net_notifier); >>>> } >>> These calls are moved to rxe_net_del which is called by an explicit unlink command. >>> But if rxe_net_init fails and returns an error code this will never happen. >>> This will result in leaking resources. >> Thanks a lot. Bob. >> >> Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released. >> >> I will fix it in the latest version. >> >> Zhu Yanjun > I haven't had a chance to test netns yet. I am sure it works but I will test it. Yes. Please. It is an interesting feature. > The only other thing I noticed are some stylistic differences with the rest of > the rxe driver. You use > > struct rxe_dev *rdev; > > elsewhere it is > > struct rxe_dev *rxe; > > Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device. > rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver > specific ones. It's less typing that way. Got you. I think we should use rxe instead of rdev. I will fix it in the latest commits. > > With a couple of exceptions all the printk's are now in the form > > rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to. > > where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically > adapted from the siw driver. If I can get you correctly, you mean that we should use rxe_dbg_qp, .... to replace pr_err .... I have questions: 1). What benefit will this bring? 2). If the log is in module __init or module __exit functions, we should use pr_xxx? Because obj does not exist in these __init and __exit functions. Best Regards, Zhu Yanjun > > Regards, > > Bob > >>> Bob >>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h >>>> index a222c3eeae12..f48f22f3353b 100644 >>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h >>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h >>>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets { >>>> }; >>>> int rxe_net_add(const char *ibdev_name, struct net_device *ndev); >>>> +void rxe_net_del(struct ib_device *dev); >>>> int rxe_register_notifier(void); >>>> int rxe_net_init(void);
On 6/21/23 01:17, Zhu Yanjun wrote: > > 在 2023/6/21 11:23, Bob Pearson 写道: >> On 6/20/23 21:13, Zhu Yanjun wrote: >>> 在 2023/6/21 4:21, Bob Pearson 写道: >>>> On 4/28/23 04:39, Zhu Yanjun wrote: >>>>> From: Zhu Yanjun <yanjun.zhu@linux.dev> >>>>> >>>>> When running "rdma link del" command, dellink function will be called. >>>>> If the sock refcnt is greater than the refcnt needed for udp tunnel, >>>>> the sock refcnt will be decreased by 1. >>>>> >>>>> If equal, the last rdma link is removed. The udp tunnel will be >>>>> destroyed. >>>>> >>>>> Tested-by: Rain River <rain.1986.08.12@gmail.com> >>>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev> >>>>> --- >>>>> drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++- >>>>> drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++-- >>>>> drivers/infiniband/sw/rxe/rxe_net.h | 1 + >>>>> 3 files changed, 27 insertions(+), 3 deletions(-) >>>>> >>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c >>>>> index 0ce6adb43cfc..ebfabc6d6b76 100644 >>>>> --- a/drivers/infiniband/sw/rxe/rxe.c >>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c >>>>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) >>>>> /* called by ifc layer to create new rxe device. >>>>> * The caller should allocate memory for rxe by calling ib_alloc_device. >>>>> */ >>>>> +static struct rdma_link_ops rxe_link_ops; >>>>> int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) >>>>> { >>>>> rxe_init(rxe); >>>>> rxe_set_mtu(rxe, mtu); >>>>> + rxe->ib_dev.link_ops = &rxe_link_ops; >>>>> return rxe_register_device(rxe, ibdev_name); >>>>> } >>>>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) >>>>> return err; >>>>> } >>>>> -struct rdma_link_ops rxe_link_ops = { >>>>> +static int rxe_dellink(struct ib_device *dev) >>>>> +{ >>>>> + rxe_net_del(dev); >>>>> + >>>>> + return 0; >>>>> +} >>>>> + >>>>> +static struct rdma_link_ops rxe_link_ops = { >>>>> .type = "rxe", >>>>> .newlink = rxe_newlink, >>>>> + .dellink = rxe_dellink, >>>>> }; >>>>> static int __init rxe_module_init(void) >>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c >>>>> index 3ca92e062800..4cc7de7b115b 100644 >>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c >>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c >>>>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) >>>>> return 0; >>>>> } >>>>> +#define SK_REF_FOR_TUNNEL 2 >>>>> +void rxe_net_del(struct ib_device *dev) >>>>> +{ >>>>> + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) >>>>> + __sock_put(recv_sockets.sk6->sk); >>>>> + else >>>>> + rxe_release_udp_tunnel(recv_sockets.sk6); >>>>> + >>>>> + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) >>>>> + __sock_put(recv_sockets.sk4->sk); >>>>> + else >>>>> + rxe_release_udp_tunnel(recv_sockets.sk4); >>>>> +} >>>>> +#undef SK_REF_FOR_TUNNEL >>>>> + >>>>> static void rxe_port_event(struct rxe_dev *rxe, >>>>> enum ib_event_type event) >>>>> { >>>>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void) >>>>> void rxe_net_exit(void) >>>>> { >>>>> - rxe_release_udp_tunnel(recv_sockets.sk6); >>>>> - rxe_release_udp_tunnel(recv_sockets.sk4); >>>>> unregister_netdevice_notifier(&rxe_net_notifier); >>>>> } >>>> These calls are moved to rxe_net_del which is called by an explicit unlink command. >>>> But if rxe_net_init fails and returns an error code this will never happen. >>>> This will result in leaking resources. >>> Thanks a lot. Bob. >>> >>> Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released. >>> >>> I will fix it in the latest version. >>> >>> Zhu Yanjun >> I haven't had a chance to test netns yet. I am sure it works but I will test it. > Yes. Please. It is an interesting feature. >> The only other thing I noticed are some stylistic differences with the rest of >> the rxe driver. You use >> >> struct rxe_dev *rdev; >> >> elsewhere it is >> >> struct rxe_dev *rxe; >> >> Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device. >> rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver >> specific ones. It's less typing that way. > > > Got you. I think we should use rxe instead of rdev. I will fix it in the latest commits. > > >> >> With a couple of exceptions all the printk's are now in the form >> >> rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to. >> >> where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically >> adapted from the siw driver. > > If I can get you correctly, you mean that we should use rxe_dbg_qp, .... to replace pr_err .... > > I have questions: > > 1). What benefit will this bring? > > 2). If the log is in module __init or module __exit functions, we should use pr_xxx? Because obj does not exist in these __init and __exit functions. I think that is the way the driver is now. Go ahead. > > Best Regards, > > Zhu Yanjun > >> >> Regards, >> >> Bob >> >>>> Bob >>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h >>>>> index a222c3eeae12..f48f22f3353b 100644 >>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h >>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h >>>>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets { >>>>> }; >>>>> int rxe_net_add(const char *ibdev_name, struct net_device *ndev); >>>>> +void rxe_net_del(struct ib_device *dev); >>>>> int rxe_register_notifier(void); >>>>> int rxe_net_init(void);
在 2023/6/22 0:24, Bob Pearson 写道: > On 6/21/23 01:17, Zhu Yanjun wrote: >> >> 在 2023/6/21 11:23, Bob Pearson 写道: >>> On 6/20/23 21:13, Zhu Yanjun wrote: >>>> 在 2023/6/21 4:21, Bob Pearson 写道: >>>>> On 4/28/23 04:39, Zhu Yanjun wrote: >>>>>> From: Zhu Yanjun <yanjun.zhu@linux.dev> >>>>>> >>>>>> When running "rdma link del" command, dellink function will be called. >>>>>> If the sock refcnt is greater than the refcnt needed for udp tunnel, >>>>>> the sock refcnt will be decreased by 1. >>>>>> >>>>>> If equal, the last rdma link is removed. The udp tunnel will be >>>>>> destroyed. >>>>>> >>>>>> Tested-by: Rain River <rain.1986.08.12@gmail.com> >>>>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev> >>>>>> --- >>>>>> drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++- >>>>>> drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++-- >>>>>> drivers/infiniband/sw/rxe/rxe_net.h | 1 + >>>>>> 3 files changed, 27 insertions(+), 3 deletions(-) >>>>>> >>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c >>>>>> index 0ce6adb43cfc..ebfabc6d6b76 100644 >>>>>> --- a/drivers/infiniband/sw/rxe/rxe.c >>>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c >>>>>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) >>>>>> /* called by ifc layer to create new rxe device. >>>>>> * The caller should allocate memory for rxe by calling ib_alloc_device. >>>>>> */ >>>>>> +static struct rdma_link_ops rxe_link_ops; >>>>>> int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) >>>>>> { >>>>>> rxe_init(rxe); >>>>>> rxe_set_mtu(rxe, mtu); >>>>>> + rxe->ib_dev.link_ops = &rxe_link_ops; >>>>>> return rxe_register_device(rxe, ibdev_name); >>>>>> } >>>>>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) >>>>>> return err; >>>>>> } >>>>>> -struct rdma_link_ops rxe_link_ops = { >>>>>> +static int rxe_dellink(struct ib_device *dev) >>>>>> +{ >>>>>> + rxe_net_del(dev); >>>>>> + >>>>>> + return 0; >>>>>> +} >>>>>> + >>>>>> +static struct rdma_link_ops rxe_link_ops = { >>>>>> .type = "rxe", >>>>>> .newlink = rxe_newlink, >>>>>> + .dellink = rxe_dellink, >>>>>> }; >>>>>> static int __init rxe_module_init(void) >>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c >>>>>> index 3ca92e062800..4cc7de7b115b 100644 >>>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c >>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c >>>>>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) >>>>>> return 0; >>>>>> } >>>>>> +#define SK_REF_FOR_TUNNEL 2 >>>>>> +void rxe_net_del(struct ib_device *dev) >>>>>> +{ >>>>>> + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) >>>>>> + __sock_put(recv_sockets.sk6->sk); >>>>>> + else >>>>>> + rxe_release_udp_tunnel(recv_sockets.sk6); >>>>>> + >>>>>> + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) >>>>>> + __sock_put(recv_sockets.sk4->sk); >>>>>> + else >>>>>> + rxe_release_udp_tunnel(recv_sockets.sk4); >>>>>> +} >>>>>> +#undef SK_REF_FOR_TUNNEL >>>>>> + >>>>>> static void rxe_port_event(struct rxe_dev *rxe, >>>>>> enum ib_event_type event) >>>>>> { >>>>>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void) >>>>>> void rxe_net_exit(void) >>>>>> { >>>>>> - rxe_release_udp_tunnel(recv_sockets.sk6); >>>>>> - rxe_release_udp_tunnel(recv_sockets.sk4); >>>>>> unregister_netdevice_notifier(&rxe_net_notifier); >>>>>> } >>>>> These calls are moved to rxe_net_del which is called by an explicit unlink command. >>>>> But if rxe_net_init fails and returns an error code this will never happen. >>>>> This will result in leaking resources. >>>> Thanks a lot. Bob. >>>> >>>> Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released. >>>> >>>> I will fix it in the latest version. >>>> >>>> Zhu Yanjun >>> I haven't had a chance to test netns yet. I am sure it works but I will test it. >> Yes. Please. It is an interesting feature. >>> The only other thing I noticed are some stylistic differences with the rest of >>> the rxe driver. You use >>> >>> struct rxe_dev *rdev; >>> >>> elsewhere it is >>> >>> struct rxe_dev *rxe; >>> >>> Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device. >>> rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver >>> specific ones. It's less typing that way. >> >> >> Got you. I think we should use rxe instead of rdev. I will fix it in the latest commits. >> >> >>> >>> With a couple of exceptions all the printk's are now in the form >>> >>> rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to. >>> >>> where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically >>> adapted from the siw driver. >> >> If I can get you correctly, you mean that we should use rxe_dbg_qp, .... to replace pr_err .... >> >> I have questions: >> >> 1). What benefit will this bring? >> >> 2). If the log is in module __init or module __exit functions, we should use pr_xxx? Because obj does not exist in these __init and __exit functions. > I think that is the way the driver is now. Go ahead. OK. The latest commits will be sent out very soon. Zhu Yanjun >> >> Best Regards, >> >> Zhu Yanjun >> >>> >>> Regards, >>> >>> Bob >>> >>>>> Bob >>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h >>>>>> index a222c3eeae12..f48f22f3353b 100644 >>>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h >>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h >>>>>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets { >>>>>> }; >>>>>> int rxe_net_add(const char *ibdev_name, struct net_device *ndev); >>>>>> +void rxe_net_del(struct ib_device *dev); >>>>>> int rxe_register_notifier(void); >>>>>> int rxe_net_init(void); >
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 0ce6adb43cfc..ebfabc6d6b76 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) /* called by ifc layer to create new rxe device. * The caller should allocate memory for rxe by calling ib_alloc_device. */ +static struct rdma_link_ops rxe_link_ops; int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) { rxe_init(rxe); rxe_set_mtu(rxe, mtu); + rxe->ib_dev.link_ops = &rxe_link_ops; return rxe_register_device(rxe, ibdev_name); } @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) return err; } -struct rdma_link_ops rxe_link_ops = { +static int rxe_dellink(struct ib_device *dev) +{ + rxe_net_del(dev); + + return 0; +} + +static struct rdma_link_ops rxe_link_ops = { .type = "rxe", .newlink = rxe_newlink, + .dellink = rxe_dellink, }; static int __init rxe_module_init(void) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 3ca92e062800..4cc7de7b115b 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) return 0; } +#define SK_REF_FOR_TUNNEL 2 +void rxe_net_del(struct ib_device *dev) +{ + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) + __sock_put(recv_sockets.sk6->sk); + else + rxe_release_udp_tunnel(recv_sockets.sk6); + + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL) + __sock_put(recv_sockets.sk4->sk); + else + rxe_release_udp_tunnel(recv_sockets.sk4); +} +#undef SK_REF_FOR_TUNNEL + static void rxe_port_event(struct rxe_dev *rxe, enum ib_event_type event) { @@ -689,8 +704,6 @@ int rxe_register_notifier(void) void rxe_net_exit(void) { - rxe_release_udp_tunnel(recv_sockets.sk6); - rxe_release_udp_tunnel(recv_sockets.sk4); unregister_netdevice_notifier(&rxe_net_notifier); } diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h index a222c3eeae12..f48f22f3353b 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.h +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -17,6 +17,7 @@ struct rxe_recv_sockets { }; int rxe_net_add(const char *ibdev_name, struct net_device *ndev); +void rxe_net_del(struct ib_device *dev); int rxe_register_notifier(void); int rxe_net_init(void);