diff mbox series

[05/10] RDMA/device: Add ib_device_get_by_netdev()

Message ID 20190213041256.22437-6-jgg@ziepe.ca (mailing list archive)
State Accepted
Delegated to: Jason Gunthorpe
Headers show
Series Revise device handling in rxe | expand

Commit Message

Jason Gunthorpe Feb. 13, 2019, 4:12 a.m. UTC
From: Jason Gunthorpe <jgg@mellanox.com>

Several drivers need to find the ib_device from a given netdev. rxe needs
this at speed in an unsleepable context, so implement the translation
using an RCU-safe hash table.

The hash table can have a many-to-one mapping. This is intended to support
some future case where multiple IB drivers (e.g. iWARP and RoCE) connect to
the same netdevs. The driver_ids will need to differ to support this.

In the process, this makes struct ib_device and struct ib_port_data
RCU-safe by deferring their kfree() calls.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c | 119 +++++++++++++++++++++++++++----
 include/rdma/ib_verbs.h          |  10 ++-
 2 files changed, 116 insertions(+), 13 deletions(-)
diff mbox series

Patch

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 14c91f9af6ccc9..ae70091c20c19f 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -40,6 +40,7 @@ 
 #include <linux/netdevice.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
+#include <linux/hashtable.h>
 #include <rdma/rdma_netlink.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
@@ -133,6 +134,10 @@  static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
 	     !xa_is_err(entry);                                                \
 	     (index)++, entry = xan_find_marked(xa, &(index), filter))
 
+/* RCU hash table mapping netdevice pointers to struct ib_port_data */
+static DEFINE_SPINLOCK(ndev_hash_lock);
+static DECLARE_HASHTABLE(ndev_hash, 5);
+
 static void free_netdevs(struct ib_device *ib_dev);
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
 			      void *lsm_data);
@@ -143,6 +148,12 @@  static struct notifier_block ibdev_lsm_nb = {
 	.notifier_call = ib_security_change,
 };
 
+/* Pointer to the RCU head at the start of the ib_port_data array */
+struct ib_port_data_rcu {
+	struct rcu_head rcu_head;
+	struct ib_port_data pdata[];
+};
+
 static int ib_device_check_mandatory(struct ib_device *device)
 {
 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
@@ -294,9 +305,12 @@  static void ib_device_release(struct device *device)
 	WARN_ON(refcount_read(&dev->refcount));
 	ib_cache_release_one(dev);
 	ib_security_release_port_pkey_list(dev);
-	kfree(dev->port_data);
 	xa_destroy(&dev->client_data);
-	kfree(dev);
+	if (dev->port_data)
+		kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
+				       pdata[0]),
+			  rcu_head);
+	kfree_rcu(dev, rcu_head);
 }
 
 static int ib_device_uevent(struct device *device,
@@ -461,6 +475,7 @@  static void remove_client_context(struct ib_device *device,
 
 static int alloc_port_data(struct ib_device *device)
 {
+	struct ib_port_data_rcu *pdata_rcu;
 	unsigned int port;
 
 	if (device->port_data)
@@ -477,17 +492,26 @@  static int alloc_port_data(struct ib_device *device)
 	 * Therefore port_data is declared as a 1 based array with potential
 	 * empty slots at the beginning.
 	 */
-	device->port_data = kcalloc(rdma_end_port(device) + 1,
-				    sizeof(*device->port_data), GFP_KERNEL);
-	if (!device->port_data)
+	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
+					rdma_end_port(device) + 1),
+			    GFP_KERNEL);
+	if (!pdata_rcu)
 		return -ENOMEM;
+	/*
+	 * The rcu_head is put in front of the port data array and the stored
+	 * pointer is adjusted since we never need to see that member until
+	 * kfree_rcu.
+	 */
+	device->port_data = pdata_rcu->pdata;
 
 	rdma_for_each_port (device, port) {
 		struct ib_port_data *pdata = &device->port_data[port];
 
+		pdata->ib_dev = device;
 		spin_lock_init(&pdata->pkey_list_lock);
 		INIT_LIST_HEAD(&pdata->pkey_list);
 		spin_lock_init(&pdata->netdev_lock);
+		INIT_HLIST_NODE(&pdata->ndev_hash_link);
 	}
 	return 0;
 }
@@ -1028,6 +1052,29 @@  int ib_query_port(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_query_port);
 
+static void add_ndev_hash(struct ib_port_data *pdata)
+{
+	unsigned long flags;
+
+	might_sleep();
+
+	spin_lock_irqsave(&ndev_hash_lock, flags);
+	if (hash_hashed(&pdata->ndev_hash_link)) {
+		hash_del_rcu(&pdata->ndev_hash_link);
+		spin_unlock_irqrestore(&ndev_hash_lock, flags);
+		/*
+		 * We cannot do hash_add_rcu after a hash_del_rcu until the
+		 * grace period
+		 */
+		synchronize_rcu();
+		spin_lock_irqsave(&ndev_hash_lock, flags);
+	}
+	if (pdata->netdev)
+		hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
+			     (uintptr_t)pdata->netdev);
+	spin_unlock_irqrestore(&ndev_hash_lock, flags);
+}
+
 /**
  * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
  * @ib_dev: Device to modify
@@ -1064,17 +1111,19 @@  int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
 
 	pdata = &ib_dev->port_data[port];
 	spin_lock_irqsave(&pdata->netdev_lock, flags);
-	if (pdata->netdev == ndev) {
+	old_ndev = rcu_dereference_protected(
+		pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+	if (old_ndev == ndev) {
 		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
 		return 0;
 	}
-	old_ndev = pdata->netdev;
 
 	if (ndev)
 		dev_hold(ndev);
-	pdata->netdev = ndev;
+	rcu_assign_pointer(pdata->netdev, ndev);
 	spin_unlock_irqrestore(&pdata->netdev_lock, flags);
 
+	add_ndev_hash(pdata);
 	if (old_ndev)
 		dev_put(old_ndev);
 
@@ -1089,11 +1138,24 @@  static void free_netdevs(struct ib_device *ib_dev)
 
 	rdma_for_each_port (ib_dev, port) {
 		struct ib_port_data *pdata = &ib_dev->port_data[port];
+		struct net_device *ndev;
 
 		spin_lock_irqsave(&pdata->netdev_lock, flags);
-		if (pdata->netdev) {
-			dev_put(pdata->netdev);
-			pdata->netdev = NULL;
+		ndev = rcu_dereference_protected(
+			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+		if (ndev) {
+			spin_lock(&ndev_hash_lock);
+			hash_del_rcu(&pdata->ndev_hash_link);
+			spin_unlock(&ndev_hash_lock);
+
+			/*
+			 * If this is the last dev_put there is still a
+			 * synchronize_rcu before the netdev is kfreed, so we
+			 * can continue to rely on unlocked pointer
+			 * comparisons after the put
+			 */
+			rcu_assign_pointer(pdata->netdev, NULL);
+			dev_put(ndev);
 		}
 		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
 	}
@@ -1118,7 +1180,8 @@  struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
 		res = ib_dev->ops.get_netdev(ib_dev, port);
 	else {
 		spin_lock(&pdata->netdev_lock);
-		res = pdata->netdev;
+		res = rcu_dereference_protected(
+			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
 		if (res)
 			dev_hold(res);
 		spin_unlock(&pdata->netdev_lock);
@@ -1136,6 +1199,38 @@  struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
 	return res;
 }
 
+/**
+ * ib_device_get_by_netdev - Find an IB device associated with a netdev
+ * @ndev: netdev to locate
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
+ *
+ * Find and hold an ib_device that is associated with a netdev via
+ * ib_device_set_netdev(). The caller must call ib_device_put() on the
+ * returned pointer.
+ */
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+					  enum rdma_driver_id driver_id)
+{
+	struct ib_device *res = NULL;
+	struct ib_port_data *cur;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
+				    (uintptr_t)ndev) {
+		if (rcu_access_pointer(cur->netdev) == ndev &&
+		    (driver_id == RDMA_DRIVER_UNKNOWN ||
+		     cur->ib_dev->driver_id == driver_id) &&
+		    ib_device_try_get(cur->ib_dev)) {
+			res = cur->ib_dev;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return res;
+}
+EXPORT_SYMBOL(ib_device_get_by_netdev);
+
 /**
  * ib_enum_roce_netdev - enumerate all RoCE ports
  * @ib_dev : IB device we want to query
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 358bda4a76ff42..585512daef3cb2 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2198,6 +2198,8 @@  struct ib_port_immutable {
 };
 
 struct ib_port_data {
+	struct ib_device *ib_dev;
+
 	struct ib_port_immutable immutable;
 
 	spinlock_t pkey_list_lock;
@@ -2206,7 +2208,8 @@  struct ib_port_data {
 	struct ib_port_cache cache;
 
 	spinlock_t netdev_lock;
-	struct net_device *netdev;
+	struct net_device __rcu *netdev;
+	struct hlist_node ndev_hash_link;
 };
 
 /* rdma netdev type - specifies protocol type */
@@ -2543,6 +2546,7 @@  struct ib_device {
 	struct device                *dma_device;
 	struct ib_device_ops	     ops;
 	char                          name[IB_DEVICE_NAME_MAX];
+	struct rcu_head rcu_head;
 
 	struct list_head              event_handler_list;
 	spinlock_t                    event_handler_lock;
@@ -3997,6 +4001,10 @@  static inline bool ib_device_try_get(struct ib_device *dev)
 }
 
 void ib_device_put(struct ib_device *device);
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+					  enum rdma_driver_id driver_id);
+struct ib_device *ib_device_get_by_name(const char *name,
+					enum rdma_driver_id driver_id);
 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
 					    u16 pkey, const union ib_gid *gid,
 					    const struct sockaddr *addr);