diff mbox series

[v7,5/5] rdma_rxe: use netlink messages to add/delete links

Message ID 454cee5c5751111e0705510d773b71be5cdcb751.1544826079.git.swise@opengridcomputing.com (mailing list archive)
State Superseded
Headers show
Series Dynamic rdma link creation | expand

Commit Message

Steve Wise Dec. 14, 2018, 4:05 p.m. UTC
Add support for the RDMA_NLDEV_CMD_NEWLINK/DELLINK messages which allow
dynamically adding new RXE links.  Deprecate the old module options
for now.

Cc: Moni Shoua <monis@mellanox.com>
Reviewed-by: Yanjun Zhu <yanjun.zhu@oracle.com>
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
---
 drivers/infiniband/sw/rxe/rxe.c       | 39 +++++++++++++++++++++++++++++++++--
 drivers/infiniband/sw/rxe/rxe.h       |  2 +-
 drivers/infiniband/sw/rxe/rxe_net.c   |  4 ++--
 drivers/infiniband/sw/rxe/rxe_net.h   |  2 +-
 drivers/infiniband/sw/rxe/rxe_param.h |  3 ++-
 drivers/infiniband/sw/rxe/rxe_sysfs.c |  6 +++---
 drivers/infiniband/sw/rxe/rxe_verbs.c |  4 ++--
 drivers/infiniband/sw/rxe/rxe_verbs.h |  2 +-
 8 files changed, 49 insertions(+), 13 deletions(-)

Comments

Jason Gunthorpe Dec. 18, 2018, 8:40 p.m. UTC | #1
On Fri, Dec 14, 2018 at 08:05:57AM -0800, Steve Wise wrote:
> Add support for the RDMA_NLDEV_CMD_NEWLINK/DELLINK messages which allow
> dynamically adding new RXE links.  Deprecate the old module options
> for now.
> 
> Cc: Moni Shoua <monis@mellanox.com>
> Reviewed-by: Yanjun Zhu <yanjun.zhu@oracle.com>
> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>  drivers/infiniband/sw/rxe/rxe.c       | 39 +++++++++++++++++++++++++++++++++--
>  drivers/infiniband/sw/rxe/rxe.h       |  2 +-
>  drivers/infiniband/sw/rxe/rxe_net.c   |  4 ++--
>  drivers/infiniband/sw/rxe/rxe_net.h   |  2 +-
>  drivers/infiniband/sw/rxe/rxe_param.h |  3 ++-
>  drivers/infiniband/sw/rxe/rxe_sysfs.c |  6 +++---
>  drivers/infiniband/sw/rxe/rxe_verbs.c |  4 ++--
>  drivers/infiniband/sw/rxe/rxe_verbs.h |  2 +-
>  8 files changed, 49 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 91ad339fd6e1..07fea762f0ea 100644
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -31,6 +31,7 @@
>   * SOFTWARE.
>   */
>  
> +#include <rdma/rdma_netlink.h>
>  #include <net/addrconf.h>
>  #include "rxe.h"
>  #include "rxe_loc.h"
> @@ -301,7 +302,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>  /* called by ifc layer to create new rxe device.
>   * The caller should allocate memory for rxe by calling ib_alloc_device.
>   */
> -int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
> +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>  {
>  	int err;
>  
> @@ -311,9 +312,39 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
>  
>  	rxe_set_mtu(rxe, mtu);
>  
> -	return rxe_register_device(rxe);
> +	return rxe_register_device(rxe, ibdev_name);
>  }
>  
> +static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
> +{
> +	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
> +	int err = 0;
> +
> +	if (rxe) {
> +		pr_err("already configured on %s\n", ndev->name);
> +		err = -EEXIST;
> +		ib_device_put(&rxe->ib_dev);
> +		goto err;
> +	}
> +
> +	rxe = rxe_net_add(ibdev_name, ndev);
> +	if (!rxe) {
> +		pr_err("failed to add %s\n", ndev->name);
> +		err = -EINVAL;
> +		goto err;
> +	}
> +
> +	rxe_set_port_state(ndev, rxe);

Hurm. So there is another pre-existing problem here..

ib_register_device creates a device and that immediately becomes
available to be unregistered - ie this can race with a netdev notifier
tearing down the netdev.

So, rxe_register_device/net_add can't return a pointer and we can't
call rxe_set_port_state like this.

rxe_set_port_state has to be done before registration or as part of
some kind of core code callback setting up the device, or we have to
do more stuff with refcounting.

Otherwise I think the two netlink patches are looking great now. It is
unfortunate Parav discovered my patch is all busted up. :(

Jason
Steve Wise Dec. 18, 2018, 8:52 p.m. UTC | #2
On 12/18/2018 2:40 PM, Jason Gunthorpe wrote:
> On Fri, Dec 14, 2018 at 08:05:57AM -0800, Steve Wise wrote:
>> Add support for the RDMA_NLDEV_CMD_NEWLINK/DELLINK messages which allow
>> dynamically adding new RXE links.  Deprecate the old module options
>> for now.
>>
>> Cc: Moni Shoua <monis@mellanox.com>
>> Reviewed-by: Yanjun Zhu <yanjun.zhu@oracle.com>
>> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>>  drivers/infiniband/sw/rxe/rxe.c       | 39 +++++++++++++++++++++++++++++++++--
>>  drivers/infiniband/sw/rxe/rxe.h       |  2 +-
>>  drivers/infiniband/sw/rxe/rxe_net.c   |  4 ++--
>>  drivers/infiniband/sw/rxe/rxe_net.h   |  2 +-
>>  drivers/infiniband/sw/rxe/rxe_param.h |  3 ++-
>>  drivers/infiniband/sw/rxe/rxe_sysfs.c |  6 +++---
>>  drivers/infiniband/sw/rxe/rxe_verbs.c |  4 ++--
>>  drivers/infiniband/sw/rxe/rxe_verbs.h |  2 +-
>>  8 files changed, 49 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>> index 91ad339fd6e1..07fea762f0ea 100644
>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>> @@ -31,6 +31,7 @@
>>   * SOFTWARE.
>>   */
>>  
>> +#include <rdma/rdma_netlink.h>
>>  #include <net/addrconf.h>
>>  #include "rxe.h"
>>  #include "rxe_loc.h"
>> @@ -301,7 +302,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>>  /* called by ifc layer to create new rxe device.
>>   * The caller should allocate memory for rxe by calling ib_alloc_device.
>>   */
>> -int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
>> +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>>  {
>>  	int err;
>>  
>> @@ -311,9 +312,39 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
>>  
>>  	rxe_set_mtu(rxe, mtu);
>>  
>> -	return rxe_register_device(rxe);
>> +	return rxe_register_device(rxe, ibdev_name);
>>  }
>>  
>> +static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>> +{
>> +	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
>> +	int err = 0;
>> +
>> +	if (rxe) {
>> +		pr_err("already configured on %s\n", ndev->name);
>> +		err = -EEXIST;
>> +		ib_device_put(&rxe->ib_dev);
>> +		goto err;
>> +	}
>> +
>> +	rxe = rxe_net_add(ibdev_name, ndev);
>> +	if (!rxe) {
>> +		pr_err("failed to add %s\n", ndev->name);
>> +		err = -EINVAL;
>> +		goto err;
>> +	}
>> +
>> +	rxe_set_port_state(ndev, rxe);
> Hurm. So there is another pre-existing problem here..
>
> ib_register_device creates a device and that immediately becomes
> available to be unregistered - ie this can race with a netdev notifier
> tearing down the netdev.
>
> So, rxe_register_device/net_add can't return a pointer and we can't
> call rxe_set_port_state like this.
>
> rxe_set_port_state has to be done before registration or as part of
> some kind of core code callback setting up the device, or we have to
> do more stuff with refcounting.


ib_register_device() could take a kref on the ib_device before it
becomes available to be unregistered.  Then the callers could be
required to ib_device_put() at the end of these sorts of operations.  Maybe?


> Otherwise I think the two netlink patches are looking great now. It is
> unfortunate Parav discovered my patch is all busted up. :(
Jason Gunthorpe Dec. 18, 2018, 8:55 p.m. UTC | #3
On Tue, Dec 18, 2018 at 02:52:40PM -0600, Steve Wise wrote:

> > Hurm. So there is another pre-existing problem here..
> >
> > ib_register_device creates a device and that immediately becomes
> > available to be unregistered - ie this can race with a netdev notifier
> > tearing down the netdev.
> >
> > So, rxe_register_device/net_add can't return a pointer and we can't
> > call rxe_set_port_state like this.
> >
> > rxe_set_port_state has to be done before registration or as part of
> > some kind of core code callback setting up the device, or we have to
> > do more stuff with refcounting.
> 
> ib_register_device() could take a kref on the ib_device before it
> becomes available to be unregistered.  Then the callers could be
> required to ib_device_put() at the end of these sorts of operations.  Maybe?

Maybe, we could also have a 'setup' callback like netdev does...

 Frankly I'm a bit suspicious that drivers touching the device after
registration are even right :)

Perhaps this is why the netdev version of newlink creates the netdev
on behalf of the client, so the core can provide enough locking
against dellink.. This is made more complicated by the unregister path
too..

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 91ad339fd6e1..07fea762f0ea 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -31,6 +31,7 @@ 
  * SOFTWARE.
  */
 
+#include <rdma/rdma_netlink.h>
 #include <net/addrconf.h>
 #include "rxe.h"
 #include "rxe_loc.h"
@@ -301,7 +302,7 @@  void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
 /* called by ifc layer to create new rxe device.
  * The caller should allocate memory for rxe by calling ib_alloc_device.
  */
-int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
 {
 	int err;
 
@@ -311,9 +312,39 @@  int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
 
 	rxe_set_mtu(rxe, mtu);
 
-	return rxe_register_device(rxe);
+	return rxe_register_device(rxe, ibdev_name);
 }
 
+static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
+{
+	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
+	int err = 0;
+
+	if (rxe) {
+		pr_err("already configured on %s\n", ndev->name);
+		err = -EEXIST;
+		ib_device_put(&rxe->ib_dev);
+		goto err;
+	}
+
+	rxe = rxe_net_add(ibdev_name, ndev);
+	if (!rxe) {
+		pr_err("failed to add %s\n", ndev->name);
+		err = -EINVAL;
+		goto err;
+	}
+
+	rxe_set_port_state(ndev, rxe);
+	pr_info("added %s to %s\n", rxe->ib_dev.name, ndev->name);
+err:
+	return err;
+}
+
+static struct rdma_link_ops rxe_link_ops = {
+	.type = "rxe",
+	.newlink = rxe_newlink,
+};
+
 static int __init rxe_module_init(void)
 {
 	int err;
@@ -329,12 +360,14 @@  static int __init rxe_module_init(void)
 	if (err)
 		return err;
 
+	rdma_link_register(&rxe_link_ops);
 	pr_info("loaded\n");
 	return 0;
 }
 
 static void __exit rxe_module_exit(void)
 {
+	rdma_link_unregister(&rxe_link_ops);
 	ib_unregister_driver(RDMA_DRIVER_RXE);
 	rxe_net_exit();
 	rxe_cache_exit();
@@ -344,3 +377,5 @@  static void __exit rxe_module_exit(void)
 
 late_initcall(rxe_module_init);
 module_exit(rxe_module_exit);
+
+MODULE_ALIAS_RDMA_LINK("rxe");
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
index 4ff33d921865..d1861b2067e5 100644
--- a/drivers/infiniband/sw/rxe/rxe.h
+++ b/drivers/infiniband/sw/rxe/rxe.h
@@ -95,7 +95,7 @@  static inline u32 rxe_crc32(struct rxe_dev *rxe,
 
 void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
 
-int rxe_add(struct rxe_dev *rxe, unsigned int mtu);
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name);
 
 void rxe_rcv(struct sk_buff *skb);
 
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index c05549d09268..d65c88ff949d 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -520,7 +520,7 @@  enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num)
 	return IB_LINK_LAYER_ETHERNET;
 }
 
-struct rxe_dev *rxe_net_add(struct net_device *ndev)
+struct rxe_dev *rxe_net_add(const char *ibdev_name, struct net_device *ndev)
 {
 	int err;
 	struct rxe_dev *rxe = NULL;
@@ -531,7 +531,7 @@  struct rxe_dev *rxe_net_add(struct net_device *ndev)
 	rxe->ib_dev.driver_unregister = rxe_unregistered;
 	rxe->ndev = ndev;
 
-	err = rxe_add(rxe, ndev->mtu);
+	err = rxe_add(rxe, ndev->mtu, ibdev_name);
 	if (err) {
 		rxe_unregistered(&rxe->ib_dev);
 		return NULL;
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
index 685aa98baf55..4f87be455b5e 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.h
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -43,7 +43,7 @@  struct rxe_recv_sockets {
 	struct socket *sk6;
 };
 
-struct rxe_dev *rxe_net_add(struct net_device *ndev);
+struct rxe_dev *rxe_net_add(const char *ibdev_name, struct net_device *ndev);
 
 int rxe_net_init(void);
 void rxe_net_exit(void);
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
index bdea899a58ac..1abed47ca221 100644
--- a/drivers/infiniband/sw/rxe/rxe_param.h
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -78,7 +78,8 @@  enum rxe_device_param {
 					| IB_DEVICE_SYS_IMAGE_GUID
 					| IB_DEVICE_RC_RNR_NAK_GEN
 					| IB_DEVICE_SRQ_RESIZE
-					| IB_DEVICE_MEM_MGT_EXTENSIONS,
+					| IB_DEVICE_MEM_MGT_EXTENSIONS
+					| IB_DEVICE_ALLOW_USER_UNREG,
 	RXE_MAX_SGE			= 32,
 	RXE_MAX_SGE_RD			= 32,
 	RXE_MAX_CQ			= 16384,
diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c
index de9f51056f52..4dbd004956c8 100644
--- a/drivers/infiniband/sw/rxe/rxe_sysfs.c
+++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
@@ -82,7 +82,7 @@  static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
 		goto err;
 	}
 
-	rxe = rxe_net_add(ndev);
+	rxe = rxe_net_add("rxe%d", ndev);
 	if (!rxe) {
 		pr_err("failed to add %s\n", intf);
 		err = -EINVAL;
@@ -135,6 +135,6 @@  static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
 };
 
 module_param_cb(add, &rxe_add_ops, NULL, 0200);
-MODULE_PARM_DESC(add, "Create RXE device over network interface");
+MODULE_PARM_DESC(add, "DEPRECATED.  Create RXE device over network interface");
 module_param_cb(remove, &rxe_remove_ops, NULL, 0200);
-MODULE_PARM_DESC(remove, "Remove RXE device over network interface");
+MODULE_PARM_DESC(remove, "DEPRECATED.  Remove RXE device over network interface");
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 28beee8f5838..05ed6d8f46ea 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1165,7 +1165,7 @@  static ssize_t parent_show(struct device *device,
 	.attrs = rxe_dev_attributes,
 };
 
-int rxe_register_device(struct rxe_dev *rxe)
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
 {
 	int err;
 	struct ib_device *dev = &rxe->ib_dev;
@@ -1273,7 +1273,7 @@  int rxe_register_device(struct rxe_dev *rxe)
 
 	rdma_set_device_sysfs_group(dev, &rxe_attr_group);
 	dev->driver_id = RDMA_DRIVER_RXE;
-	err = ib_register_device(dev, "rxe%d", NULL);
+	err = ib_register_device(dev, ibdev_name, NULL);
 	if (err) {
 		pr_warn("%s failed with error %d\n", __func__, err);
 		goto err1;
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index ce63cbf0d738..cae04e016d83 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -465,7 +465,7 @@  static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
 	return mw ? container_of(mw, struct rxe_mem, ibmw) : NULL;
 }
 
-int rxe_register_device(struct rxe_dev *rxe);
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name);
 
 void rxe_mc_cleanup(struct rxe_pool_entry *arg);