diff mbox series

[for-next,3/6] RDMA/rxe: Register IP mcast address

Message ID 20231103204324.9606-4-rpearsonhpe@gmail.com (mailing list archive)
State Deferred
Headers show
Series RDMA/rxe: Make multicast actually work | expand

Commit Message

Bob Pearson Nov. 3, 2023, 8:43 p.m. UTC
Add code to rxe_mcast_add() and rxe_mcast_del() to register/deregister
the IP multicast address. This is required for multicast traffic to
reach the rxe driver.

Fixes: 6090a0c4c7c6 ("RDMA/rxe: Cleanup rxe_mcast.c")
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_mcast.c | 110 +++++++++++++++++++++-----
 drivers/infiniband/sw/rxe/rxe_net.c   |   2 +-
 drivers/infiniband/sw/rxe/rxe_net.h   |   1 +
 drivers/infiniband/sw/rxe/rxe_verbs.h |   1 +
 4 files changed, 93 insertions(+), 21 deletions(-)

Comments

Zhu Yanjun Nov. 4, 2023, 12:42 p.m. UTC | #1
在 2023/11/4 4:43, Bob Pearson 写道:
> Add code to rxe_mcast_add() and rxe_mcast_del() to register/deregister
> the IP multicast address. This is required for multicast traffic to
> reach the rxe driver.
>
> Fixes: 6090a0c4c7c6 ("RDMA/rxe: Cleanup rxe_mcast.c")
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> ---
>   drivers/infiniband/sw/rxe/rxe_mcast.c | 110 +++++++++++++++++++++-----
>   drivers/infiniband/sw/rxe/rxe_net.c   |   2 +-
>   drivers/infiniband/sw/rxe/rxe_net.h   |   1 +
>   drivers/infiniband/sw/rxe/rxe_verbs.h |   1 +
>   4 files changed, 93 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
> index 86cc2e18a7fd..ec757b955979 100644
> --- a/drivers/infiniband/sw/rxe/rxe_mcast.c
> +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
> @@ -19,38 +19,107 @@
>    * mcast packets in the rxe receive path.
>    */
>   
> +#include <linux/igmp.h>
> +
>   #include "rxe.h"
>   
> -/**
> - * rxe_mcast_add - add multicast address to rxe device
> - * @rxe: rxe device object
> - * @mgid: multicast address as a gid
> - *
> - * Returns 0 on success else an error
> - */
> -static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
> +/* register mcast IP and MAC addresses with net stack */
> +static int rxe_mcast_add6(struct rxe_dev *rxe, union ib_gid *mgid)
>   {
>   	unsigned char ll_addr[ETH_ALEN];
> +	struct in6_addr *addr6 = (struct in6_addr *)mgid;
> +	int err;


Using reverse fir tree, a.k.a. reverse Christmas tree or reverse XMAS 
tree, for

variable declarations isn't strictly required, though it is still preferred.

Zhu Yanjun


> +
> +	rtnl_lock();
> +	err = ipv6_sock_mc_join(recv_sockets.sk6->sk, rxe->ndev->ifindex,
> +				addr6);
> +	rtnl_unlock();
> +	if (err && err != -EADDRINUSE)
> +		goto err_out;
>   
>   	ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
> +	err = dev_mc_add(rxe->ndev, ll_addr);
> +	if (err)
> +		goto err_drop;
> +
> +	return 0;
>   
> -	return dev_mc_add(rxe->ndev, ll_addr);
> +err_drop:
> +	ipv6_sock_mc_drop(recv_sockets.sk6->sk, rxe->ndev->ifindex, addr6);
> +err_out:
> +	return err;
>   }
>   
> -/**
> - * rxe_mcast_del - delete multicast address from rxe device
> - * @rxe: rxe device object
> - * @mgid: multicast address as a gid
> - *
> - * Returns 0 on success else an error
> - */
> -static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid)
> +static int rxe_mcast_add(struct rxe_mcg *mcg)
>   {
> +	struct rxe_dev *rxe = mcg->rxe;
> +	union ib_gid *mgid = &mcg->mgid;
> +	struct ip_mreqn imr = {};
>   	unsigned char ll_addr[ETH_ALEN];
> +	int err;
> +
> +	if (mcg->is_ipv6)
> +		return rxe_mcast_add6(rxe, mgid);
> +
> +	imr.imr_multiaddr = *(struct in_addr *)(mgid->raw + 12);
> +	imr.imr_ifindex = rxe->ndev->ifindex;
> +	rtnl_lock();
> +	err = ip_mc_join_group(recv_sockets.sk4->sk, &imr);
> +	rtnl_unlock();
> +	if (err && err != -EADDRINUSE)
> +		goto err_out;
> +
> +	ip_eth_mc_map(imr.imr_multiaddr.s_addr, ll_addr);
> +	err = dev_mc_add(rxe->ndev, ll_addr);
> +	if (err)
> +		goto err_leave;
> +
> +	return 0;
> +
> +err_leave:
> +	ip_mc_leave_group(recv_sockets.sk4->sk, &imr);
> +err_out:
> +	return err;
> +}
> +
> +/* deregister mcast IP and MAC addresses with net stack */
> +static int rxe_mcast_del6(struct rxe_dev *rxe, union ib_gid *mgid)
> +{
> +	unsigned char ll_addr[ETH_ALEN];
> +	int err, err2;
>   
>   	ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
> +	err = dev_mc_del(rxe->ndev, ll_addr);
> +
> +	rtnl_lock();
> +	err2 = ipv6_sock_mc_drop(recv_sockets.sk6->sk,
> +			rxe->ndev->ifindex, (struct in6_addr *)mgid);
> +	rtnl_unlock();
> +
> +	return err ?: err2;
> +}
> +
> +static int rxe_mcast_del(struct rxe_mcg *mcg)
> +{
> +	struct rxe_dev *rxe = mcg->rxe;
> +	union ib_gid *mgid = &mcg->mgid;
> +	struct ip_mreqn imr = {};
> +	unsigned char ll_addr[ETH_ALEN];
> +	int err, err2;
> +
> +	if (mcg->is_ipv6)
> +		return rxe_mcast_del6(rxe, mgid);
> +
> +	imr.imr_multiaddr = *(struct in_addr *)(mgid->raw + 12);
> +	imr.imr_ifindex = rxe->ndev->ifindex;
> +	ip_eth_mc_map(imr.imr_multiaddr.s_addr, ll_addr);
> +	err = dev_mc_del(rxe->ndev, ll_addr);
> +
> +	rtnl_lock();
> +	err2 = ip_mc_leave_group(recv_sockets.sk4->sk, &imr);
> +	rtnl_unlock();
>   
> -	return dev_mc_del(rxe->ndev, ll_addr);
> +	return err ?: err2;
>   }
>   
>   /**
> @@ -164,6 +233,7 @@ static void __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid,
>   {
>   	kref_init(&mcg->ref_cnt);
>   	memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid));
> +	mcg->is_ipv6 = !ipv6_addr_v4mapped((struct in6_addr *)mgid);
>   	INIT_LIST_HEAD(&mcg->qp_list);
>   	mcg->rxe = rxe;
>   
> @@ -225,7 +295,7 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
>   	spin_unlock_bh(&rxe->mcg_lock);
>   
>   	/* add mcast address outside of lock */
> -	err = rxe_mcast_add(rxe, mgid);
> +	err = rxe_mcast_add(mcg);
>   	if (!err)
>   		return mcg;
>   
> @@ -273,7 +343,7 @@ static void __rxe_destroy_mcg(struct rxe_mcg *mcg)
>   static void rxe_destroy_mcg(struct rxe_mcg *mcg)
>   {
>   	/* delete mcast address outside of lock */
> -	rxe_mcast_del(mcg->rxe, &mcg->mgid);
> +	rxe_mcast_del(mcg);
>   
>   	spin_lock_bh(&mcg->rxe->mcg_lock);
>   	__rxe_destroy_mcg(mcg);
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
> index 2fad56fc95e7..36617d07fddf 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.c
> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> @@ -18,7 +18,7 @@
>   #include "rxe_net.h"
>   #include "rxe_loc.h"
>   
> -static struct rxe_recv_sockets recv_sockets;
> +struct rxe_recv_sockets recv_sockets;
>   
>   static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
>   					 struct net_device *ndev,
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
> index 45d80d00f86b..89cee7d5340f 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.h
> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
> @@ -15,6 +15,7 @@ struct rxe_recv_sockets {
>   	struct socket *sk4;
>   	struct socket *sk6;
>   };
> +extern struct rxe_recv_sockets recv_sockets;
>   
>   int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index ccb9d19ffe8a..7be9e6232dd9 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -352,6 +352,7 @@ struct rxe_mcg {
>   	atomic_t		qp_num;
>   	u32			qkey;
>   	u16			pkey;
> +	bool			is_ipv6;
>   };
>   
>   struct rxe_mca {
Bob Pearson Nov. 5, 2023, 8:19 p.m. UTC | #2
On 11/4/23 07:42, Zhu Yanjun wrote:

> 
> Using reverse fir tree, a.k.a. reverse Christmas tree or reverse XMAS 
> tree, for
> 
> variable declarations isn't strictly required, though it is still 
> preferred.
> 
> Zhu Yanjun
> 
> 
Yeah. I usually follow that style for new code (except if there are
dependencies) but mostly add new variables at the end of the list
together  because it makes the patch simpler to read. At least it
does for me. If you care, I am happy to fix this.

Bob
Zhu Yanjun Nov. 6, 2023, 1:26 p.m. UTC | #3
在 2023/11/6 4:19, Bob Pearson 写道:
>
>
> On 11/4/23 07:42, Zhu Yanjun wrote:
>
>>
>> Using reverse fir tree, a.k.a. reverse Christmas tree or reverse XMAS 
>> tree, for
>>
>> variable declarations isn't strictly required, though it is still 
>> preferred.
>>
>> Zhu Yanjun
>>
>>
> Yeah. I usually follow that style for new code (except if there are
> dependencies) but mostly add new variables at the end of the list
> together  because it makes the patch simpler to read. At least it
> does for me. If you care, I am happy to fix this.

Yes. It is good to fix it.

And your commits add mcast address supports. And I think you

should have the test case in the rdma-core to verify these commits.

Can you share the test case in the rdma maillist? ^_^

Zhu Yanjun

>
> Bob
Bob Pearson Nov. 6, 2023, 5:31 p.m. UTC | #4
On 11/6/23 07:26, Zhu Yanjun wrote:
> 
> 在 2023/11/6 4:19, Bob Pearson 写道:
>>
>>
>> On 11/4/23 07:42, Zhu Yanjun wrote:
>>
>>>
>>> Using reverse fir tree, a.k.a. reverse Christmas tree or reverse XMAS 
>>> tree, for
>>>
>>> variable declarations isn't strictly required, though it is still 
>>> preferred.
>>>
>>> Zhu Yanjun
>>>
>>>
>> Yeah. I usually follow that style for new code (except if there are
>> dependencies) but mostly add new variables at the end of the list
>> together  because it makes the patch simpler to read. At least it
>> does for me. If you care, I am happy to fix this.
> 
> Yes. It is good to fix it.
> 
> And your commits add mcast address supports. And I think you
> 
> should have the test case in the rdma-core to verify these commits.
> 
> Can you share the test case in the rdma maillist? ^_^
> 
> Zhu Yanjun
> 
>>
>> Bob

I could share it but it's not really in a good shape to publish. I
have to modify the limits in rxe_param.h to test max_etc. And currently
I need to hand edit the send/recv versions to do node to node. In other
words just enough to (by hand) work through the use cases enough to
convince myself it works using ip maddr and wireshark along with the
program.

What you are asking for is a bunch of work to make the test program
more like iperf or ib_send_bw. Ideally it should either reload the
driver or do something else to let each test case be a clean start.

In an ideal world there would be a two node version of pyverbs. :-)

Bob
Zhu Yanjun Nov. 8, 2023, 1:24 a.m. UTC | #5
在 2023/11/7 1:31, Bob Pearson 写道:
> 
> 
> On 11/6/23 07:26, Zhu Yanjun wrote:
>>
>> 在 2023/11/6 4:19, Bob Pearson 写道:
>>>
>>>
>>> On 11/4/23 07:42, Zhu Yanjun wrote:
>>>
>>>>
>>>> Using reverse fir tree, a.k.a. reverse Christmas tree or reverse 
>>>> XMAS tree, for
>>>>
>>>> variable declarations isn't strictly required, though it is still 
>>>> preferred.
>>>>
>>>> Zhu Yanjun
>>>>
>>>>
>>> Yeah. I usually follow that style for new code (except if there are
>>> dependencies) but mostly add new variables at the end of the list
>>> together  because it makes the patch simpler to read. At least it
>>> does for me. If you care, I am happy to fix this.
>>
>> Yes. It is good to fix it.
>>
>> And your commits add mcast address supports. And I think you
>>
>> should have the test case in the rdma-core to verify these commits.
>>
>> Can you share the test case in the rdma maillist? ^_^
>>
>> Zhu Yanjun
>>
>>>
>>> Bob
> 
> I could share it but it's not really in a good shape to publish. I
> have to modify the limits in rxe_param.h to test max_etc. And currently
> I need to hand edit the send/recv versions to do node to node. In other
> words just enough to (by hand) work through the use cases enough to
> convince myself it works using ip maddr and wireshark along with the
> program.
> 
> What you are asking for is a bunch of work to make the test program
> more like iperf or ib_send_bw. Ideally it should either reload the
> driver or do something else to let each test case be a clean start.

Got it.
Anyway, a test case in rdma-core is needed to make tests with this feature.

And this feature is related with mcast. So please also send these 
commits to NETDEV maillist. NETDEV people can also give us a lot of good 
advice.

Thanks,
Zhu Yanjun

> 
> In an ideal world there would be a two node version of pyverbs. :-)
> 
> Bob
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
index 86cc2e18a7fd..ec757b955979 100644
--- a/drivers/infiniband/sw/rxe/rxe_mcast.c
+++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
@@ -19,38 +19,107 @@ 
  * mcast packets in the rxe receive path.
  */
 
+#include <linux/igmp.h>
+
 #include "rxe.h"
 
-/**
- * rxe_mcast_add - add multicast address to rxe device
- * @rxe: rxe device object
- * @mgid: multicast address as a gid
- *
- * Returns 0 on success else an error
- */
-static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
+/* register mcast IP and MAC addresses with net stack */
+static int rxe_mcast_add6(struct rxe_dev *rxe, union ib_gid *mgid)
 {
 	unsigned char ll_addr[ETH_ALEN];
+	struct in6_addr *addr6 = (struct in6_addr *)mgid;
+	int err;
+
+	rtnl_lock();
+	err = ipv6_sock_mc_join(recv_sockets.sk6->sk, rxe->ndev->ifindex,
+				addr6);
+	rtnl_unlock();
+	if (err && err != -EADDRINUSE)
+		goto err_out;
 
 	ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+	err = dev_mc_add(rxe->ndev, ll_addr);
+	if (err)
+		goto err_drop;
+
+	return 0;
 
-	return dev_mc_add(rxe->ndev, ll_addr);
+err_drop:
+	ipv6_sock_mc_drop(recv_sockets.sk6->sk, rxe->ndev->ifindex, addr6);
+err_out:
+	return err;
 }
 
-/**
- * rxe_mcast_del - delete multicast address from rxe device
- * @rxe: rxe device object
- * @mgid: multicast address as a gid
- *
- * Returns 0 on success else an error
- */
-static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid)
+static int rxe_mcast_add(struct rxe_mcg *mcg)
 {
+	struct rxe_dev *rxe = mcg->rxe;
+	union ib_gid *mgid = &mcg->mgid;
+	struct ip_mreqn imr = {};
 	unsigned char ll_addr[ETH_ALEN];
+	int err;
+
+	if (mcg->is_ipv6)
+		return rxe_mcast_add6(rxe, mgid);
+
+	imr.imr_multiaddr = *(struct in_addr *)(mgid->raw + 12);
+	imr.imr_ifindex = rxe->ndev->ifindex;
+	rtnl_lock();
+	err = ip_mc_join_group(recv_sockets.sk4->sk, &imr);
+	rtnl_unlock();
+	if (err && err != -EADDRINUSE)
+		goto err_out;
+
+	ip_eth_mc_map(imr.imr_multiaddr.s_addr, ll_addr);
+	err = dev_mc_add(rxe->ndev, ll_addr);
+	if (err)
+		goto err_leave;
+
+	return 0;
+
+err_leave:
+	ip_mc_leave_group(recv_sockets.sk4->sk, &imr);
+err_out:
+	return err;
+}
+
+/* deregister mcast IP and MAC addresses with net stack */
+static int rxe_mcast_del6(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+	unsigned char ll_addr[ETH_ALEN];
+	int err, err2;
 
 	ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+	err = dev_mc_del(rxe->ndev, ll_addr);
+
+	rtnl_lock();
+	err2 = ipv6_sock_mc_drop(recv_sockets.sk6->sk,
+			rxe->ndev->ifindex, (struct in6_addr *)mgid);
+	rtnl_unlock();
+
+	return err ?: err2;
+}
+
+static int rxe_mcast_del(struct rxe_mcg *mcg)
+{
+	struct rxe_dev *rxe = mcg->rxe;
+	union ib_gid *mgid = &mcg->mgid;
+	struct ip_mreqn imr = {};
+	unsigned char ll_addr[ETH_ALEN];
+	int err, err2;
+
+	if (mcg->is_ipv6)
+		return rxe_mcast_del6(rxe, mgid);
+
+	imr.imr_multiaddr = *(struct in_addr *)(mgid->raw + 12);
+	imr.imr_ifindex = rxe->ndev->ifindex;
+	ip_eth_mc_map(imr.imr_multiaddr.s_addr, ll_addr);
+	err = dev_mc_del(rxe->ndev, ll_addr);
+
+	rtnl_lock();
+	err2 = ip_mc_leave_group(recv_sockets.sk4->sk, &imr);
+	rtnl_unlock();
 
-	return dev_mc_del(rxe->ndev, ll_addr);
+	return err ?: err2;
 }
 
 /**
@@ -164,6 +233,7 @@  static void __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid,
 {
 	kref_init(&mcg->ref_cnt);
 	memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid));
+	mcg->is_ipv6 = !ipv6_addr_v4mapped((struct in6_addr *)mgid);
 	INIT_LIST_HEAD(&mcg->qp_list);
 	mcg->rxe = rxe;
 
@@ -225,7 +295,7 @@  static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
 	spin_unlock_bh(&rxe->mcg_lock);
 
 	/* add mcast address outside of lock */
-	err = rxe_mcast_add(rxe, mgid);
+	err = rxe_mcast_add(mcg);
 	if (!err)
 		return mcg;
 
@@ -273,7 +343,7 @@  static void __rxe_destroy_mcg(struct rxe_mcg *mcg)
 static void rxe_destroy_mcg(struct rxe_mcg *mcg)
 {
 	/* delete mcast address outside of lock */
-	rxe_mcast_del(mcg->rxe, &mcg->mgid);
+	rxe_mcast_del(mcg);
 
 	spin_lock_bh(&mcg->rxe->mcg_lock);
 	__rxe_destroy_mcg(mcg);
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 2fad56fc95e7..36617d07fddf 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -18,7 +18,7 @@ 
 #include "rxe_net.h"
 #include "rxe_loc.h"
 
-static struct rxe_recv_sockets recv_sockets;
+struct rxe_recv_sockets recv_sockets;
 
 static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
 					 struct net_device *ndev,
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
index 45d80d00f86b..89cee7d5340f 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.h
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -15,6 +15,7 @@  struct rxe_recv_sockets {
 	struct socket *sk4;
 	struct socket *sk6;
 };
+extern struct rxe_recv_sockets recv_sockets;
 
 int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
 
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index ccb9d19ffe8a..7be9e6232dd9 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -352,6 +352,7 @@  struct rxe_mcg {
 	atomic_t		qp_num;
 	u32			qkey;
 	u16			pkey;
+	bool			is_ipv6;
 };
 
 struct rxe_mca {