Message ID | 20231207192907.10113-4-rpearsonhpe@gmail.com (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
Series | RDMA/rxe: Make multicast work | expand |
在 2023/12/8 3:29, Bob Pearson 写道: > Currently the rdma_rxe driver does not receive mcast packets at all. > > Add code to rxe_mcast_add() and rxe_mcast_del() to register/deregister > the IP mcast address. This is required for mcast traffic to reach the > rxe driver when coming from an external source. > > Fixes: 8700e3e7c485 ("Soft RoCE driver") > Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> > --- > drivers/infiniband/sw/rxe/rxe_mcast.c | 119 +++++++++++++++++++++----- > drivers/infiniband/sw/rxe/rxe_net.c | 2 +- > drivers/infiniband/sw/rxe/rxe_net.h | 1 + > drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + > 4 files changed, 102 insertions(+), 21 deletions(-) > > diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c > index 86cc2e18a7fd..5236761892dd 100644 > --- a/drivers/infiniband/sw/rxe/rxe_mcast.c > +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c > @@ -19,38 +19,116 @@ > * mcast packets in the rxe receive path. > */ > > +#include <linux/igmp.h> > + > #include "rxe.h" > > -/** > - * rxe_mcast_add - add multicast address to rxe device > - * @rxe: rxe device object > - * @mgid: multicast address as a gid > - * > - * Returns 0 on success else an error > - */ > -static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) > +static int rxe_mcast_add6(struct rxe_dev *rxe, union ib_gid *mgid) > { > + struct in6_addr *addr6 = (struct in6_addr *)mgid; > + struct sock *sk = recv_sockets.sk6->sk; > unsigned char ll_addr[ETH_ALEN]; > + int err; > + > + lock_sock(sk); > + rtnl_lock(); > + err = ipv6_sock_mc_join(sk, rxe->ndev->ifindex, addr6); > + rtnl_unlock(); > + release_sock(sk); > + if (err && err != -EADDRINUSE) > + goto err_out; > > ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); > + err = dev_mc_add(rxe->ndev, ll_addr); > + if (err) > + goto err_drop; > + > + return 0; > > - return dev_mc_add(rxe->ndev, ll_addr); > +err_drop: > + lock_sock(sk); > + rtnl_lock(); > + ipv6_sock_mc_drop(sk, rxe->ndev->ifindex, addr6); > + rtnl_unlock(); > + release_sock(sk); > +err_out: > + return err; > } > > -/** > - * rxe_mcast_del - delete multicast address from rxe device > - * @rxe: rxe device object > - * @mgid: multicast address as a gid > - * > - * Returns 0 on success else an error > - */ > -static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid) > +static int rxe_mcast_add(struct rxe_mcg *mcg) > { > + struct rxe_dev *rxe = mcg->rxe; > + union ib_gid *mgid = &mcg->mgid; > unsigned char ll_addr[ETH_ALEN]; > + struct ip_mreqn imr = {}; > + int err; > + > + if (mcg->is_ipv6) > + return rxe_mcast_add6(rxe, mgid); > + > + imr.imr_multiaddr = *(struct in_addr *)(mgid->raw + 12); > + imr.imr_ifindex = rxe->ndev->ifindex; > + rtnl_lock(); > + err = ip_mc_join_group(recv_sockets.sk4->sk, &imr); > + rtnl_unlock(); Hi, David Ahern About the functions ip_mc_join_group, ipv6_sock_mc_drop, ipv6_sock_mc_drop and ip_mc_leave_group, Can you share your advice about them? In the following, lock_sock is used. Can you help us to check them? ./net/ipv4/devinet.c-634- lock_sock(sk); ./net/ipv4/devinet.c-635- if (join) ./net/ipv4/devinet.c:636: ret = ip_mc_join_group(sk, &mreq); ./net/ipv4/devinet.c-637- else ./net/ipv4/devinet.c-638- ret = ip_mc_leave_group(sk, &mreq); ./net/ipv4/devinet.c-639- release_sock(sk); Thanks a lot. Zhu Yanjun > + if (err && err != -EADDRINUSE) > + goto err_out; > + > + ip_eth_mc_map(imr.imr_multiaddr.s_addr, ll_addr); > + err = dev_mc_add(rxe->ndev, ll_addr); > + if (err) > + goto err_leave; > + > + return 0; > + > +err_leave: > + rtnl_lock(); > + ip_mc_leave_group(recv_sockets.sk4->sk, &imr); > + rtnl_unlock(); > +err_out: > + return err; > +} > + > +static int rxe_mcast_del6(struct rxe_dev *rxe, union ib_gid *mgid) > +{ > + struct sock *sk = recv_sockets.sk6->sk; > + unsigned char ll_addr[ETH_ALEN]; > + int err, err2; > > ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); > + err = dev_mc_del(rxe->ndev, ll_addr); > + > + lock_sock(sk); > + rtnl_lock(); > + err2 = ipv6_sock_mc_drop(sk, rxe->ndev->ifindex, > + (struct in6_addr *)mgid); > + rtnl_unlock(); > + release_sock(sk); > + > + return err ?: err2; > +} > + > +static int rxe_mcast_del(struct rxe_mcg *mcg) > +{ > + struct rxe_dev *rxe = mcg->rxe; > + union ib_gid *mgid = &mcg->mgid; > + unsigned char ll_addr[ETH_ALEN]; > + struct ip_mreqn imr = {}; > + int err, err2; > + > + if (mcg->is_ipv6) > + return rxe_mcast_del6(rxe, mgid); > + > + imr.imr_multiaddr = *(struct in_addr *)(mgid->raw + 12); > + imr.imr_ifindex = rxe->ndev->ifindex; > + ip_eth_mc_map(imr.imr_multiaddr.s_addr, ll_addr); > + err = dev_mc_del(rxe->ndev, ll_addr); > + > + rtnl_lock(); > + err2 = ip_mc_leave_group(recv_sockets.sk4->sk, &imr); > + rtnl_unlock(); > > - return dev_mc_del(rxe->ndev, ll_addr); > + return err ?: err2; > } > > /** > @@ -164,6 +242,7 @@ static void __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid, > { > kref_init(&mcg->ref_cnt); > memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid)); > + mcg->is_ipv6 = !ipv6_addr_v4mapped((struct in6_addr *)mgid); > INIT_LIST_HEAD(&mcg->qp_list); > mcg->rxe = rxe; > > @@ -225,7 +304,7 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) > spin_unlock_bh(&rxe->mcg_lock); > > /* add mcast address outside of lock */ > - err = rxe_mcast_add(rxe, mgid); > + err = rxe_mcast_add(mcg); > if (!err) > return mcg; > > @@ -273,7 +352,7 @@ static void __rxe_destroy_mcg(struct rxe_mcg *mcg) > static void rxe_destroy_mcg(struct rxe_mcg *mcg) > { > /* delete mcast address outside of lock */ > - rxe_mcast_del(mcg->rxe, &mcg->mgid); > + rxe_mcast_del(mcg); > > spin_lock_bh(&mcg->rxe->mcg_lock); > __rxe_destroy_mcg(mcg); > diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c > index 58c3f3759bf0..b481f8da2002 100644 > --- a/drivers/infiniband/sw/rxe/rxe_net.c > +++ b/drivers/infiniband/sw/rxe/rxe_net.c > @@ -18,7 +18,7 @@ > #include "rxe_net.h" > #include "rxe_loc.h" > > -static struct rxe_recv_sockets recv_sockets; > +struct rxe_recv_sockets recv_sockets; > > static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, > struct net_device *ndev, > diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h > index 45d80d00f86b..89cee7d5340f 100644 > --- a/drivers/infiniband/sw/rxe/rxe_net.h > +++ b/drivers/infiniband/sw/rxe/rxe_net.h > @@ -15,6 +15,7 @@ struct rxe_recv_sockets { > struct socket *sk4; > struct socket *sk6; > }; > +extern struct rxe_recv_sockets recv_sockets; > > int rxe_net_add(const char *ibdev_name, struct net_device *ndev); > > diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h > index ccb9d19ffe8a..7be9e6232dd9 100644 > --- a/drivers/infiniband/sw/rxe/rxe_verbs.h > +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h > @@ -352,6 +352,7 @@ struct rxe_mcg { > atomic_t qp_num; > u32 qkey; > u16 pkey; > + bool is_ipv6; > }; > > struct rxe_mca {
On 12/7/23 12:29 PM, Bob Pearson wrote: > diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c > index 86cc2e18a7fd..5236761892dd 100644 > --- a/drivers/infiniband/sw/rxe/rxe_mcast.c > +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c > @@ -19,38 +19,116 @@ > * mcast packets in the rxe receive path. > */ > > +#include <linux/igmp.h> > + > #include "rxe.h" > > -/** > - * rxe_mcast_add - add multicast address to rxe device > - * @rxe: rxe device object > - * @mgid: multicast address as a gid > - * > - * Returns 0 on success else an error > - */ > -static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) > +static int rxe_mcast_add6(struct rxe_dev *rxe, union ib_gid *mgid) > { > + struct in6_addr *addr6 = (struct in6_addr *)mgid; > + struct sock *sk = recv_sockets.sk6->sk; > unsigned char ll_addr[ETH_ALEN]; > + int err; > + > + lock_sock(sk); > + rtnl_lock(); reverse the order. rtnl is always taken first, then socket lock. Actually, I think it would be better to avoid burying this logic in the rxe driver. Can you try using the setsockopt API? I think it is now usable within the kernel again after the refactoring for io_uring.
On 12/8/23 6:52 AM, Zhu Yanjun wrote: > Hi, David Ahern > > About the functions ip_mc_join_group, ipv6_sock_mc_drop, > ipv6_sock_mc_drop and ip_mc_leave_group, > > Can you share your advice about them? see my previous comment on this patch.
在 2023/12/8 23:50, David Ahern 写道: > On 12/7/23 12:29 PM, Bob Pearson wrote: >> diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c >> index 86cc2e18a7fd..5236761892dd 100644 >> --- a/drivers/infiniband/sw/rxe/rxe_mcast.c >> +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c >> @@ -19,38 +19,116 @@ >> * mcast packets in the rxe receive path. >> */ >> >> +#include <linux/igmp.h> >> + >> #include "rxe.h" >> >> -/** >> - * rxe_mcast_add - add multicast address to rxe device >> - * @rxe: rxe device object >> - * @mgid: multicast address as a gid >> - * >> - * Returns 0 on success else an error >> - */ >> -static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) >> +static int rxe_mcast_add6(struct rxe_dev *rxe, union ib_gid *mgid) >> { >> + struct in6_addr *addr6 = (struct in6_addr *)mgid; >> + struct sock *sk = recv_sockets.sk6->sk; >> unsigned char ll_addr[ETH_ALEN]; >> + int err; >> + >> + lock_sock(sk); >> + rtnl_lock(); > > reverse the order. rtnl is always taken first, then socket lock. > > Actually, I think it would be better to avoid burying this logic in the > rxe driver. Can you try using the setsockopt API? I think it is now > usable within the kernel again after the refactoring for io_uring. Thanks, David, Your helps are appreciated. Zhu Yanjun > > > >
On 12/8/23 09:50, David Ahern wrote: > On 12/7/23 12:29 PM, Bob Pearson wrote: >> diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c >> index 86cc2e18a7fd..5236761892dd 100644 >> --- a/drivers/infiniband/sw/rxe/rxe_mcast.c >> +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c >> @@ -19,38 +19,116 @@ >> * mcast packets in the rxe receive path. >> */ >> >> +#include <linux/igmp.h> >> + >> #include "rxe.h" >> >> -/** >> - * rxe_mcast_add - add multicast address to rxe device >> - * @rxe: rxe device object >> - * @mgid: multicast address as a gid >> - * >> - * Returns 0 on success else an error >> - */ >> -static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) >> +static int rxe_mcast_add6(struct rxe_dev *rxe, union ib_gid *mgid) >> { >> + struct in6_addr *addr6 = (struct in6_addr *)mgid; >> + struct sock *sk = recv_sockets.sk6->sk; >> unsigned char ll_addr[ETH_ALEN]; >> + int err; >> + >> + lock_sock(sk); >> + rtnl_lock(); > > reverse the order. rtnl is always taken first, then socket lock. > > Actually, I think it would be better to avoid burying this logic in the > rxe driver. Can you try using the setsockopt API? I think it is now > usable within the kernel again after the refactoring for io_uring. > > David, Thanks for looking at this. What is the in kernel setsock api? I'll give it a try but I am not sure what to call. Bob > >
On 12/10/23 5:56 PM, Bob Pearson wrote: > Thanks for looking at this. What is the in kernel setsock api? I'll give > it a try but I am not sure what to call. > do_sock_setsockopt is what io_uring uses. Start with it.
diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 86cc2e18a7fd..5236761892dd 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -19,38 +19,116 @@ * mcast packets in the rxe receive path. */ +#include <linux/igmp.h> + #include "rxe.h" -/** - * rxe_mcast_add - add multicast address to rxe device - * @rxe: rxe device object - * @mgid: multicast address as a gid - * - * Returns 0 on success else an error - */ -static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) +static int rxe_mcast_add6(struct rxe_dev *rxe, union ib_gid *mgid) { + struct in6_addr *addr6 = (struct in6_addr *)mgid; + struct sock *sk = recv_sockets.sk6->sk; unsigned char ll_addr[ETH_ALEN]; + int err; + + lock_sock(sk); + rtnl_lock(); + err = ipv6_sock_mc_join(sk, rxe->ndev->ifindex, addr6); + rtnl_unlock(); + release_sock(sk); + if (err && err != -EADDRINUSE) + goto err_out; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + err = dev_mc_add(rxe->ndev, ll_addr); + if (err) + goto err_drop; + + return 0; - return dev_mc_add(rxe->ndev, ll_addr); +err_drop: + lock_sock(sk); + rtnl_lock(); + ipv6_sock_mc_drop(sk, rxe->ndev->ifindex, addr6); + rtnl_unlock(); + release_sock(sk); +err_out: + return err; } -/** - * rxe_mcast_del - delete multicast address from rxe device - * @rxe: rxe device object - * @mgid: multicast address as a gid - * - * Returns 0 on success else an error - */ -static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid) +static int rxe_mcast_add(struct rxe_mcg *mcg) { + struct rxe_dev *rxe = mcg->rxe; + union ib_gid *mgid = &mcg->mgid; unsigned char ll_addr[ETH_ALEN]; + struct ip_mreqn imr = {}; + int err; + + if (mcg->is_ipv6) + return rxe_mcast_add6(rxe, mgid); + + imr.imr_multiaddr = *(struct in_addr *)(mgid->raw + 12); + imr.imr_ifindex = rxe->ndev->ifindex; + rtnl_lock(); + err = ip_mc_join_group(recv_sockets.sk4->sk, &imr); + rtnl_unlock(); + if (err && err != -EADDRINUSE) + goto err_out; + + ip_eth_mc_map(imr.imr_multiaddr.s_addr, ll_addr); + err = dev_mc_add(rxe->ndev, ll_addr); + if (err) + goto err_leave; + + return 0; + +err_leave: + rtnl_lock(); + ip_mc_leave_group(recv_sockets.sk4->sk, &imr); + rtnl_unlock(); +err_out: + return err; +} + +static int rxe_mcast_del6(struct rxe_dev *rxe, union ib_gid *mgid) +{ + struct sock *sk = recv_sockets.sk6->sk; + unsigned char ll_addr[ETH_ALEN]; + int err, err2; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + err = dev_mc_del(rxe->ndev, ll_addr); + + lock_sock(sk); + rtnl_lock(); + err2 = ipv6_sock_mc_drop(sk, rxe->ndev->ifindex, + (struct in6_addr *)mgid); + rtnl_unlock(); + release_sock(sk); + + return err ?: err2; +} + +static int rxe_mcast_del(struct rxe_mcg *mcg) +{ + struct rxe_dev *rxe = mcg->rxe; + union ib_gid *mgid = &mcg->mgid; + unsigned char ll_addr[ETH_ALEN]; + struct ip_mreqn imr = {}; + int err, err2; + + if (mcg->is_ipv6) + return rxe_mcast_del6(rxe, mgid); + + imr.imr_multiaddr = *(struct in_addr *)(mgid->raw + 12); + imr.imr_ifindex = rxe->ndev->ifindex; + ip_eth_mc_map(imr.imr_multiaddr.s_addr, ll_addr); + err = dev_mc_del(rxe->ndev, ll_addr); + + rtnl_lock(); + err2 = ip_mc_leave_group(recv_sockets.sk4->sk, &imr); + rtnl_unlock(); - return dev_mc_del(rxe->ndev, ll_addr); + return err ?: err2; } /** @@ -164,6 +242,7 @@ static void __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid, { kref_init(&mcg->ref_cnt); memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid)); + mcg->is_ipv6 = !ipv6_addr_v4mapped((struct in6_addr *)mgid); INIT_LIST_HEAD(&mcg->qp_list); mcg->rxe = rxe; @@ -225,7 +304,7 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) spin_unlock_bh(&rxe->mcg_lock); /* add mcast address outside of lock */ - err = rxe_mcast_add(rxe, mgid); + err = rxe_mcast_add(mcg); if (!err) return mcg; @@ -273,7 +352,7 @@ static void __rxe_destroy_mcg(struct rxe_mcg *mcg) static void rxe_destroy_mcg(struct rxe_mcg *mcg) { /* delete mcast address outside of lock */ - rxe_mcast_del(mcg->rxe, &mcg->mgid); + rxe_mcast_del(mcg); spin_lock_bh(&mcg->rxe->mcg_lock); __rxe_destroy_mcg(mcg); diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 58c3f3759bf0..b481f8da2002 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -18,7 +18,7 @@ #include "rxe_net.h" #include "rxe_loc.h" -static struct rxe_recv_sockets recv_sockets; +struct rxe_recv_sockets recv_sockets; static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, struct net_device *ndev, diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h index 45d80d00f86b..89cee7d5340f 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.h +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -15,6 +15,7 @@ struct rxe_recv_sockets { struct socket *sk4; struct socket *sk6; }; +extern struct rxe_recv_sockets recv_sockets; int rxe_net_add(const char *ibdev_name, struct net_device *ndev); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index ccb9d19ffe8a..7be9e6232dd9 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -352,6 +352,7 @@ struct rxe_mcg { atomic_t qp_num; u32 qkey; u16 pkey; + bool is_ipv6; }; struct rxe_mca {
Currently the rdma_rxe driver does not receive mcast packets at all. Add code to rxe_mcast_add() and rxe_mcast_del() to register/deregister the IP mcast address. This is required for mcast traffic to reach the rxe driver when coming from an external source. Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> --- drivers/infiniband/sw/rxe/rxe_mcast.c | 119 +++++++++++++++++++++----- drivers/infiniband/sw/rxe/rxe_net.c | 2 +- drivers/infiniband/sw/rxe/rxe_net.h | 1 + drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + 4 files changed, 102 insertions(+), 21 deletions(-)