
[rdma-next,6/8] RDMA/core: Implement compat device/sysfs tree in net namespace

Message ID 20190213172310.1681-7-leon@kernel.org (mailing list archive)
State Superseded
Delegated to: Jason Gunthorpe
Series Register infiniband class as net namespace aware class

Commit Message

Leon Romanovsky Feb. 13, 2019, 5:23 p.m. UTC
From: Parav Pandit <parav@mellanox.com>

Implement a compatibility layer for the ib_core sysfs entries so that
rdma devices can also be discovered from non-init_net net namespaces.

An ib_core_device is created in each non-init_net net namespace, and
its sysfs tree mirrors the rdma devices found in the init_net
namespace.
This makes rdma devices discoverable via sysfs in multiple
non-init_net net namespaces, which is helpful to rdma-core userspace.
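
For illustration, a minimal userspace sketch (hypothetical, not part of
this series) of what that sysfs discovery looks like from inside a net
namespace. It assumes the process is already running in the target
namespace with its own sysfs mount (for example under "ip netns exec")
and simply lists the class directory that ib_core populates:

/* list_rdma_devs.c - illustrative example only */
#include <dirent.h>
#include <stdio.h>

int main(void)
{
	DIR *dir = opendir("/sys/class/infiniband");
	struct dirent *ent;

	if (!dir) {
		perror("opendir(/sys/class/infiniband)");
		return 1;
	}
	while ((ent = readdir(dir)) != NULL) {
		if (ent->d_name[0] == '.')
			continue;	/* skip "." and ".." */
		printf("%s\n", ent->d_name);	/* e.g. mlx5_0 */
	}
	closedir(dir);
	return 0;
}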

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/core_priv.h |   3 +
 drivers/infiniband/core/device.c    | 289 +++++++++++++++++++++++++++-
 include/rdma/ib_verbs.h             |   6 +
 3 files changed, 294 insertions(+), 4 deletions(-)
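
As background for the mechanism used below, here is a stripped-down,
hypothetical sketch of the register_pernet_device() pattern this patch
follows (the demo_* names are illustrative, not from this series): the
core allocates .size bytes of per-net storage for every net namespace,
and the .init/.exit callbacks look it up with net_generic().

#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/* Per-namespace state; the core allocates one instance per net. */
struct demo_net {
	u32 cookie;		/* illustrative field */
};

static unsigned int demo_net_id;

static __net_init int demo_init_net(struct net *net)
{
	struct demo_net *dn = net_generic(net, demo_net_id);

	/* Called for every net namespace, including init_net, at creation. */
	dn->cookie = 0;
	return 0;
}

static __net_exit void demo_exit_net(struct net *net)
{
	/* Called at namespace teardown; the per-net memory itself is
	 * freed by the core after this returns.
	 */
}

static struct pernet_operations demo_net_ops = {
	.init = demo_init_net,
	.exit = demo_exit_net,
	.id   = &demo_net_id,
	.size = sizeof(struct demo_net),
};

static int __init demo_init(void)
{
	return register_pernet_device(&demo_net_ops);
}

static void __exit demo_exit(void)
{
	unregister_pernet_device(&demo_net_ops);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Registering with register_pernet_device() rather than
register_pernet_subsys() matches what this patch does for its
device-related per-net state.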

Patch

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index eeabe9ca8427..7705aa6861b5 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -333,4 +333,7 @@  int roce_resolve_route_from_path(struct sa_path_rec *rec,
 				 const struct ib_gid_attr *attr);
 
 struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr);
+
+void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev,
+		       struct net *net);
 #endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 35b4bfec91c3..0ab28ab801af 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -38,6 +38,8 @@ 
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <rdma/rdma_netlink.h>
@@ -100,6 +102,50 @@  static DECLARE_RWSEM(clients_rwsem);
  * be registered.
  */
 #define CLIENT_DATA_REGISTERED XA_MARK_1
+
+/**
+ * ib_compat_device - rdma compat device per net namespace
+ * @coredev:	IB core device
+ * @id:		xarray id to identify the compat device; same id as that of
+ *		net namespace xarray.
+ */
+struct ib_compat_device {
+	struct ib_core_device coredev;
+	u32 id; /* xarray id same as that of rdma net namespace */
+};
+
+/**
+ * rdma_dev_net - rdma net namespace metadata for a net
+ * @net:	Pointer to owner net namespace
+ * @id:		xarray id to identify the net namespace.
+ */
+struct rdma_dev_net {
+	possible_net_t net;
+	u32 id;
+};
+
+/*
+ * If netns is registered then the corresponding compat device must also
+ * be registered.
+ */
+#define NET_NS_REGISTERED XA_MARK_1
+
+static unsigned int rdma_dev_net_id;
+/*
+ * Shadow net namespace entries maintained in an xarray, referenced by
+ * the net life cycle routines (init_net/exit_net) and the device life
+ * cycle routines (reg_dev/unreg_dev).
+ * Without this shadow list, if the device life cycle routines walked
+ * the net stack's net ns list, they could miss a net ns whose init
+ * callback has already run but whose entry has not yet been added to
+ * the net ns list by setup_net().
+ */
+static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_TRACK_FREE);
+/*
+ * rwsem to protect accessing the rdma_nets xarray entries.
+ */
+static DECLARE_RWSEM(rdma_nets_rwsem);
+
 /*
  * xarray has this behavior where it won't iterate over NULL values stored in
  * allocated arrays.  So we need our own iterator to see all values stored in
@@ -226,6 +272,26 @@  static struct ib_device *__ib_device_get_by_name(const char *name)
 	return NULL;
 }
 
+static int rename_compat_devs(struct ib_device *device)
+{
+	struct ib_compat_device *cdev;
+	unsigned long index;
+	int ret = 0;
+
+	mutex_lock(&device->compat_devs_mutex);
+	xa_for_each (&device->compat_devs, index, cdev) {
+		ret = device_rename(&cdev->coredev.dev, dev_name(&device->dev));
+		if (ret) {
+			dev_warn(&cdev->coredev.dev,
+				 "Fail to rename compatdev to new name %s\n",
+				 dev_name(&device->dev));
+			break;
+		}
+	}
+	mutex_unlock(&device->compat_devs_mutex);
+	return ret;
+}
+
 int ib_device_rename(struct ib_device *ibdev, const char *name)
 {
 	int ret;
@@ -245,6 +311,7 @@  int ib_device_rename(struct ib_device *ibdev, const char *name)
 	if (ret)
 		goto out;
 	strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
+	ret = rename_compat_devs(ibdev);
 out:
 	up_write(&devices_rwsem);
 	return ret;
@@ -314,7 +381,10 @@  static int ib_device_uevent(struct device *device,
 
 static const void *net_namespace(struct device *d)
 {
-	return &init_net;
+	struct ib_core_device *coredev =
+			container_of(d, struct ib_core_device, dev);
+
+	return read_pnet(&coredev->rdma_net);
 }
 
 static struct class ib_class = {
@@ -325,8 +395,8 @@  static struct class ib_class = {
 	.namespace = net_namespace,
 };
 
-static void rdma_init_coredev(struct ib_core_device *coredev,
-			      struct ib_device *dev)
+void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev,
+		       struct net *net)
 {
 	/* This BUILD_BUG_ON is intended to catch layout change
 	 * of union of ib_core_device and device.
@@ -342,6 +412,7 @@  static void rdma_init_coredev(struct ib_core_device *coredev,
 	device_initialize(&coredev->dev);
 	coredev->owner = dev;
 	INIT_LIST_HEAD(&coredev->port_list);
+	write_pnet(&coredev->rdma_net, net);
 }
 
 /**
@@ -371,7 +442,7 @@  struct ib_device *_ib_alloc_device(size_t size)
 	}
 
 	device->groups[0] = &ib_dev_attr_group;
-	rdma_init_coredev(&device->coredev, device);
+	rdma_init_coredev(&device->coredev, device, &init_net);
 
 	INIT_LIST_HEAD(&device->event_handler_list);
 	spin_lock_init(&device->event_handler_lock);
@@ -381,6 +452,8 @@  struct ib_device *_ib_alloc_device(size_t size)
 	 */
 	xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
 	init_rwsem(&device->client_data_rwsem);
+	xa_init_flags(&device->compat_devs, XA_FLAGS_TRACK_FREE);
+	mutex_init(&device->compat_devs_mutex);
 	init_completion(&device->unreg_completion);
 
 	return device;
@@ -395,6 +468,7 @@  EXPORT_SYMBOL(_ib_alloc_device);
  */
 void ib_dealloc_device(struct ib_device *device)
 {
+	WARN_ON(!xa_empty(&device->compat_devs));
 	WARN_ON(!xa_empty(&device->client_data));
 	WARN_ON(refcount_read(&device->refcount));
 	rdma_restrack_clean(device);
@@ -593,6 +667,182 @@  static int ib_security_change(struct notifier_block *nb, unsigned long event,
 	return NOTIFY_OK;
 }
 
+static void compatdev_release(struct device *dev)
+{
+	struct ib_compat_device *cdev =
+			container_of(dev, struct ib_compat_device, coredev.dev);
+
+	kfree(cdev);
+}
+
+static int add_one_compat_dev(struct ib_device *device,
+			      struct rdma_dev_net *rnet)
+{
+	struct ib_compat_device *cdev;
+	int ret;
+
+	/* Create and add a compat device in all namespaces other than
+	 * the one the device is currently bound to.
+	 */
+	if (net_eq(read_pnet(&rnet->net),
+		   read_pnet(&device->coredev.rdma_net)))
+		return 0;
+
+	/* Whichever path among rdma_dev_init_net() and ib_register_device()
+	 * takes the compat_devs_mutex first will add the compat devices.
+	 * So if it's already added, don't add it again.
+	 */
+	mutex_lock(&device->compat_devs_mutex);
+	cdev = xa_load(&device->compat_devs, rnet->id);
+	if (cdev) {
+		/* Entry already exists for this net; no need to add it again. */
+		ret = 0;
+		goto done;
+	}
+
+	cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
+	if (!cdev) {
+		ret = -ENOMEM;
+		goto done;
+	}
+	cdev->coredev.dev.parent = device->dev.parent;
+	cdev->id = rnet->id;
+
+	rdma_init_coredev(&cdev->coredev, device, read_pnet(&rnet->net));
+	cdev->coredev.dev.release = compatdev_release;
+	dev_set_name(&cdev->coredev.dev, "%s", dev_name(&device->dev));
+
+	ret = device_add(&cdev->coredev.dev);
+	if (ret)
+		goto add_err;
+
+	ret = xa_insert(&device->compat_devs, rnet->id, cdev, GFP_KERNEL);
+	if (ret)
+		goto insert_err;
+
+	mutex_unlock(&device->compat_devs_mutex);
+	return 0;
+
+insert_err:
+	device_del(&cdev->coredev.dev);
+add_err:
+	put_device(&cdev->coredev.dev);
+done:
+	mutex_unlock(&device->compat_devs_mutex);
+	return ret;
+}
+
+static void remove_one_compat_dev(struct ib_device *device, u32 id)
+{
+	struct ib_compat_device *cdev;
+
+	mutex_lock(&device->compat_devs_mutex);
+	cdev = xa_erase(&device->compat_devs, id);
+	mutex_unlock(&device->compat_devs_mutex);
+	if (cdev) {
+		device_del(&cdev->coredev.dev);
+		put_device(&cdev->coredev.dev);
+	}
+}
+
+static void remove_compat_devs(struct ib_device *device)
+{
+	struct ib_compat_device *cdev;
+	unsigned long index;
+
+	xa_for_each (&device->compat_devs, index, cdev)
+		remove_one_compat_dev(device, index);
+}
+
+static int add_compat_devs(struct ib_device *device)
+{
+	struct rdma_dev_net *rnet = NULL;
+	unsigned long index;
+	int ret = 0;
+
+	down_read(&rdma_nets_rwsem);
+	xa_for_each_marked (&rdma_nets, index, rnet, NET_NS_REGISTERED) {
+		ret = add_one_compat_dev(device, rnet);
+		if (ret)
+			break;
+	}
+	up_read(&rdma_nets_rwsem);
+	return ret;
+}
+
+static void rdma_dev_exit_net(struct net *net)
+{
+	struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
+	struct ib_device *dev;
+	unsigned long index;
+
+	/* Clear the REGISTERED mark for this net, so that any new device
+	 * registration doesn't use this net to add compat devices.
+	 */
+	down_write(&rdma_nets_rwsem);
+	xa_clear_mark(&rdma_nets, rnet->id, NET_NS_REGISTERED);
+	up_write(&rdma_nets_rwsem);
+
+	down_read(&devices_rwsem);
+	xa_for_each (&devices, index, dev) {
+		/* Hold a device reference so the device is not freed
+		 * while we are removing its compat devices.
+		 */
+		get_device(&dev->dev);
+		/* Release the devices_rwsem so that the potentially
+		 * blocking device_del() doesn't hold the devices_rwsem
+		 * for too long.
+		 */
+		up_read(&devices_rwsem);
+		remove_one_compat_dev(dev, rnet->id);
+		put_device(&dev->dev);
+		down_read(&devices_rwsem);
+	}
+	up_read(&devices_rwsem);
+
+	down_write(&rdma_nets_rwsem);
+	xa_erase(&rdma_nets, rnet->id);
+	up_write(&rdma_nets_rwsem);
+}
+
+static __net_init int rdma_dev_init_net(struct net *net)
+{
+	struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
+	unsigned long index;
+	struct ib_device *dev;
+	int ret;
+
+	/* No need to create any compat devices in default init_net. */
+	if (net_eq(net, &init_net))
+		return 0;
+
+	write_pnet(&rnet->net, net);
+
+	down_write(&rdma_nets_rwsem);
+	ret = xa_alloc(&rdma_nets, &rnet->id, U32_MAX, rnet, GFP_KERNEL);
+	if (!ret)
+		xa_set_mark(&rdma_nets, rnet->id, NET_NS_REGISTERED);
+	up_write(&rdma_nets_rwsem);
+	if (ret)
+		return ret;
+
+	/* Hold devices_rwsem to synchronize with disable_device() and
+	 * rename_device() so that we don't add compat devices for a device
+	 * which may be undergoing its unregistration sequence.
+	 */
+	down_read(&devices_rwsem);
+	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+		ret = add_one_compat_dev(dev, rnet);
+		if (ret)
+			break;
+	}
+	up_read(&devices_rwsem);
+
+	if (ret)
+		rdma_dev_exit_net(net);
+	return ret;
+}
+
 /*
  * Assign the unique string device name and the unique device index.
  */
@@ -810,8 +1060,18 @@  int ib_register_device(struct ib_device *device, const char *name)
 	if (ret)
 		goto sysfs_cleanup;
 
+	ret = add_compat_devs(device);
+	if (ret)
+		goto disable_cleanup;
+
 	return 0;
 
+disable_cleanup:
+	disable_device(device);
+	/* Clean up any compat devices that were added by rdma_dev_init_net()
+	 * before the device was disabled.
+	 */
+	remove_compat_devs(device);
 sysfs_cleanup:
 	ib_device_unregister_sysfs(device);
 dev_cleanup:
@@ -834,6 +1094,11 @@  EXPORT_SYMBOL(ib_register_device);
 void ib_unregister_device(struct ib_device *device)
 {
 	disable_device(device);
+	/* Compat devices must be removed after the device refcount drops to
+	 * zero. Otherwise rdma_dev_init_net() may add more compat devices
+	 * after we removed them and before the device is disabled.
+	 */
+	remove_compat_devs(device);
 	ib_device_unregister_sysfs(device);
 	device_del(&device->dev);
 	ib_device_unregister_rdmacg(device);
@@ -842,6 +1107,13 @@  void ib_unregister_device(struct ib_device *device)
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
+static struct pernet_operations rdma_dev_net_ops = {
+	.init = rdma_dev_init_net,
+	.exit = rdma_dev_exit_net,
+	.id = &rdma_dev_net_id,
+	.size = sizeof(struct rdma_dev_net),
+};
+
 static int assign_client_id(struct ib_client *client)
 {
 	int ret;
@@ -1531,12 +1803,20 @@  static int __init ib_core_init(void)
 		goto err_sa;
 	}
 
+	ret = register_pernet_device(&rdma_dev_net_ops);
+	if (ret) {
+		pr_warn("Couldn't init compat dev. ret %d\n", ret);
+		goto err_compat;
+	}
+
 	nldev_init();
 	rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
 	roce_gid_mgmt_init();
 
 	return 0;
 
+err_compat:
+	unregister_lsm_notifier(&ibdev_lsm_nb);
 err_sa:
 	ib_sa_cleanup();
 err_mad:
@@ -1561,6 +1841,7 @@  static void __exit ib_core_cleanup(void)
 	roce_gid_mgmt_cleanup();
 	nldev_exit();
 	rdma_nl_unregister(RDMA_NL_LS);
+	unregister_pernet_device(&rdma_dev_net_ops);
 	unregister_lsm_notifier(&ibdev_lsm_nb);
 	ib_sa_cleanup();
 	ib_mad_cleanup();
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 30314376d032..3e78551b0bfa 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2539,6 +2539,7 @@  struct ib_core_device {
 	 * union of ib_core_device and device exists in ib_device.
 	 */
 	struct device dev;
+	possible_net_t rdma_net;
 	struct kobject *ports_kobj;
 	struct list_head port_list;
 	struct ib_device *owner; /* reach back to owner ib_device */
@@ -2613,6 +2614,11 @@  struct ib_device {
 	 */
 	refcount_t refcount;
 	struct completion unreg_completion;
+
+	/* Protects compat_devs xarray modifications */
+	struct mutex compat_devs_mutex;
+	/* Maintains compat devices for each net namespace */
+	struct xarray compat_devs;
 };
 
 struct ib_client {