diff mbox series

[v7,1/5] verbs/rxe: Use core services to keep track of RXE things

Message ID d153e12624d4f461b12607d59068ea283e06887a.1544826079.git.swise@opengridcomputing.com (mailing list archive)
State Superseded
Headers show
Series Dynamic rdma link creation | expand

Commit Message

Jason Gunthorpe Dec. 14, 2018, 4:05 p.m. UTC
Replace
 - The nonsense rxe_dev.kref with rxe_dev.ib_dev.dev.kref
 - rxe_dev_list with core's device_list.
 - Buggy net_to_rxe with ib_device_get_by_netdev that does proper
   lifetime management
 - Buggy get_rxe_by_name with ib_device_get_by_name

Add
 - ib_unregister_driver so drivers like rxe don't have to keep track
   of things to do module unload
 - ib_unregister_device_and_put to work with potiners returned by
   that various ib_device_get_* functions

The refcount change to the pool might need more work, but fundamentally
the driver cannot have krefs open in the pool across unregistration or it
contains module unlolad races.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/core_priv.h   |   1 -
 drivers/infiniband/core/device.c      | 213 ++++++++++++++++++++++++++++++++--
 drivers/infiniband/sw/rxe/rxe.c       |  38 +-----
 drivers/infiniband/sw/rxe/rxe.h       |  14 ++-
 drivers/infiniband/sw/rxe/rxe_loc.h   |   2 +-
 drivers/infiniband/sw/rxe/rxe_net.c   |  83 +++----------
 drivers/infiniband/sw/rxe/rxe_pool.c  |   4 -
 drivers/infiniband/sw/rxe/rxe_sysfs.c |  39 +++----
 drivers/infiniband/sw/rxe/rxe_verbs.c |   7 --
 drivers/infiniband/sw/rxe/rxe_verbs.h |   3 -
 include/rdma/ib_verbs.h               |  10 ++
 11 files changed, 261 insertions(+), 153 deletions(-)

Comments

Parav Pandit Dec. 15, 2018, 2:28 a.m. UTC | #1
> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> owner@vger.kernel.org> On Behalf Of Jason Gunthorpe
> Sent: Friday, December 14, 2018 10:06 AM
> To: dledford@redhat.com; Jason Gunthorpe <jgg@mellanox.com>
> Cc: linux-rdma@vger.kernel.org; BMT@zurich.ibm.com; leon@kernel.org;
> Mark Bloch <markb@mellanox.com>; yanjun.zhu@oracle.com
> Subject: [PATCH v7 1/5] verbs/rxe: Use core services to keep track of RXE
> things
> 
> Replace
>  - The nonsense rxe_dev.kref with rxe_dev.ib_dev.dev.kref
>  - rxe_dev_list with core's device_list.
>  - Buggy net_to_rxe with ib_device_get_by_netdev that does proper
>    lifetime management
>  - Buggy get_rxe_by_name with ib_device_get_by_name
> 
> Add
>  - ib_unregister_driver so drivers like rxe don't have to keep track
>    of things to do module unload
>  - ib_unregister_device_and_put to work with potiners returned by
>    that various ib_device_get_* functions
> 
> The refcount change to the pool might need more work, but fundamentally
> the driver cannot have krefs open in the pool across unregistration or it
> contains module unlolad races.
> 
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> ---
>  drivers/infiniband/core/core_priv.h   |   1 -
>  drivers/infiniband/core/device.c      | 213
> ++++++++++++++++++++++++++++++++--
>  drivers/infiniband/sw/rxe/rxe.c       |  38 +-----
>  drivers/infiniband/sw/rxe/rxe.h       |  14 ++-
>  drivers/infiniband/sw/rxe/rxe_loc.h   |   2 +-
>  drivers/infiniband/sw/rxe/rxe_net.c   |  83 +++----------
>  drivers/infiniband/sw/rxe/rxe_pool.c  |   4 -
>  drivers/infiniband/sw/rxe/rxe_sysfs.c |  39 +++----
>  drivers/infiniband/sw/rxe/rxe_verbs.c |   7 --
>  drivers/infiniband/sw/rxe/rxe_verbs.h |   3 -
>  include/rdma/ib_verbs.h               |  10 ++
>  11 files changed, 261 insertions(+), 153 deletions(-)
> 
> diff --git a/drivers/infiniband/core/core_priv.h
> b/drivers/infiniband/core/core_priv.h
> index cc7535c5e192..1b2575430032 100644
> --- a/drivers/infiniband/core/core_priv.h
> +++ b/drivers/infiniband/core/core_priv.h
> @@ -267,7 +267,6 @@ static inline int ib_mad_enforce_security(struct
> ib_mad_agent_private *map,  #endif
> 
>  struct ib_device *ib_device_get_by_index(u32 ifindex); -void
> ib_device_put(struct ib_device *device);
>  /* RDMA device netlink */
>  void nldev_init(void);
>  void nldev_exit(void);
> diff --git a/drivers/infiniband/core/device.c
> b/drivers/infiniband/core/device.c
> index 348a7fb1f945..a2523605e02a 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -85,6 +85,9 @@ struct ib_client_data {  static
> DEFINE_MUTEX(device_mutex);  static DECLARE_RWSEM(lists_rwsem);
> 
> +/* Used to serialize ib_unregister_driver */ static
> +DECLARE_RWSEM(unregister_rwsem);
> +
>  static int ib_security_change(struct notifier_block *nb, unsigned long event,
>  			      void *lsm_data);
>  static void ib_policy_change_task(struct work_struct *work); @@ -168,6
> +171,7 @@ void ib_device_put(struct ib_device *device)
>  	if (refcount_dec_and_test(&device->refcount))
>  		complete(&device->unreg_completion);
>  }
> +EXPORT_SYMBOL(ib_device_put);
> 
>  static struct ib_device *__ib_device_get_by_name(const char *name)  { @@
> -180,6 +184,27 @@ static struct ib_device *__ib_device_get_by_name(const
> char *name)
>  	return NULL;
>  }
> 
> +struct ib_device *ib_device_get_by_name(const char *name,
> +					enum rdma_driver_id driver_id)
> +{
> +	struct ib_device *device;
> +
> +	down_read(&lists_rwsem);
> +	device = __ib_device_get_by_name(name);
> +	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
> +	    device->driver_id != driver_id)
> +		device = NULL;
> +
> +	if (device) {
> +		/* Do not return a device if unregistration has started. */
> +		if (!refcount_inc_not_zero(&device->refcount))
> +			device = NULL;
> +	}
> +	up_read(&lists_rwsem);
> +	return device;
> +}
> +EXPORT_SYMBOL(ib_device_get_by_name);
> +
>  int ib_device_rename(struct ib_device *ibdev, const char *name)  {
>  	struct ib_device *device;
> @@ -641,28 +666,41 @@ int ib_register_device(struct ib_device *device,
> const char *name,  }  EXPORT_SYMBOL(ib_register_device);
> 
> -/**
> - * ib_unregister_device - Unregister an IB device
> - * @device:Device to unregister
> +/* Returns false if the device is already undergoing unregistration in
> +another
> + * thread, and the device pointer may be invalid upon return.
>   *
> - * Unregister an IB device.  All clients will receive a remove callback.
> + * If true is returned then the caller owns a kref associated with the
> + device
> + * (taken from the kref owned by the list).
> + *
> + * Upon return the device cannot be returned by any 'get' functions.
>   */
> -void ib_unregister_device(struct ib_device *device)
> +static bool ib_pre_unregister(struct ib_device *device) {
> +	bool already_removed;
> +
> +	mutex_lock(&device_mutex);
> +	down_write(&lists_rwsem);
> +	already_removed = list_empty(&device->core_list);
> +	list_del_init(&device->core_list);
> +	up_write(&lists_rwsem);
> +	mutex_unlock(&device_mutex);

No. you cannot unlock device_mutex here.
Now that device is removed from the list, another ib_register_device tries to create device with same name and fails to creates sysfs entries.

> +
> +	/* Matches the recound_set in ib_alloc_device */
Did you mean refcount set?

> +	ib_device_put(device);
> +
> +	return !already_removed;
> +}
> +
> +static void __ib_unregister_device(struct ib_device *device)
>  {
>  	struct ib_client_data *context, *tmp;
>  	unsigned long flags;
> 
> -	/*
> -	 * Wait for all netlink command callers to finish working on the
> -	 * device.
> -	 */
> -	ib_device_put(device);
>  	wait_for_completion(&device->unreg_completion);
> 
>  	mutex_lock(&device_mutex);
> 
>  	down_write(&lists_rwsem);
> -	list_del(&device->core_list);
>  	write_lock_irq(&device->client_data_lock);
>  	list_for_each_entry(context, &device->client_data_list, list)
>  		context->going_down = true;
> @@ -696,10 +734,116 @@ void ib_unregister_device(struct ib_device
> *device)
>  	up_write(&lists_rwsem);
> 
>  	device->reg_state = IB_DEV_UNREGISTERED;
> +
> +	/*
> +	 * Drivers using the new flow may not call ib_dealloc_device except
> +	 * in error unwind prior to registration success.
> +	 */
> +	if (device->driver_unregister) {
> +		device->driver_unregister(device);
> +		ib_dealloc_device(device);
> +	}
> +}
> +
> +/**
> + * ib_unregister_device - Unregister an IB device
> + * @device: The device to unregister
> + *
> + * Unregister an IB device.  All clients will receive a remove callback.
> + *
> + * Callers can call this routine only once, and must protect against
> + * races. Typically it should only be called as part of a remove
> +callback in
> + * an implementation of driver core's struct device_driver and related.
> + *
> + * A driver must choose to use either only ib_unregister_device or
> + * ib_unregister_device_and_put & ib_unregister_driver.
> + */
> +void ib_unregister_device(struct ib_device *device) {
> +	down_read(&unregister_rwsem);
> +	if (ib_pre_unregister(device))
> +		__ib_unregister_device(device);
> +	else
> +		WARN(true, "Driver mixing %s with _and_put", __func__);
> +	up_read(&unregister_rwsem);
>  }
>  EXPORT_SYMBOL(ib_unregister_device);
> 
>  /**
> + * ib_unregister_device_and_put - Unregister a device while holding a 'get'
> + * device: The device to unregister
> + *
> + * This is the same as ib_unregister_device(), except it includes an
> +internal
> + * ib_device_put() that should match a 'get' obtained by the caller.
> + *
> + * It is safe to call this routine concurrently from multiple threads
> +while
> + * holding the 'get', however in this case return from the function
> +does not
> + * guarantee the device is destroyed, only that destruction is in-progress.
> + *
> + * Drivers using this flow MUST use the driver_unregister callback to
> +clean up
> + * their resources associated with the device and dealloc it.
> + */
> +void ib_unregister_device_and_put(struct ib_device *device) {
> +	down_read(&unregister_rwsem);
> +	if (ib_pre_unregister(device)) {
> +		/*
> +		 * Caller's get can be returned now that we hold the kref,
> +		 * otherwise ib_pre_unregister returned the caller's get
> +		 */
> +		ib_device_put(device);
> +		__ib_unregister_device(device);
> +	}
> +	up_read(&unregister_rwsem);
> +}
> +EXPORT_SYMBOL(ib_unregister_device_and_put);
> +
> +/**
> + * ib_unregister_driver - Unregister all IB devices for a driver
> + * @driver_id: The driver to unregister
> + *
> + * This implements a fence for device unregistration. It only returns
> +once all
> + * devices associated with the driver_id have fully completed their
> + * unregistration and returned from ib_unregister_device*().
> + *
> + * If device's are not yet unregistered it goes ahead and starts
> +unregistering
> + * them.
> + *
> + * The caller must ensure that no devices can be be added while this is
> + * running (or at all for the module_unload case)  */ void
> +ib_unregister_driver(enum rdma_driver_id driver_id) {
> +	struct ib_device *device, *tmp;
> +	LIST_HEAD(driver_devs);
> +
> +	down_write(&unregister_rwsem);
> +	mutex_lock(&device_mutex);
> +	down_write(&lists_rwsem);
> +	list_for_each_entry_safe(device, tmp, &device_list, core_list) {
> +		if (device->driver_id == driver_id)
> +			list_move(&device->core_list, &driver_devs);
> +	}
> +	up_write(&lists_rwsem);
> +
> +	while ((device = list_first_entry_or_null(
> +			&driver_devs, struct ib_device, core_list))) {
> +		mutex_unlock(&device_mutex);
> +		/*
> +		 * We are holding the unregister_rwsem, so it is impossible
> +		 * for another thread to be doing registration.
registration or unregistration?

> +		 */
> +		if (!ib_pre_unregister(device))
> +			WARN(true, "Impossible failure of
> ib_pre_unregister");
> +		__ib_unregister_device(device);
> +		mutex_lock(&device_mutex);
> +	}
> +	mutex_unlock(&device_mutex);
> +	up_write(&unregister_rwsem);
> +}
> +EXPORT_SYMBOL(ib_unregister_driver);
> +
> +/**
>   * ib_register_client - Register an IB client
>   * @client:Client to register
>   *
> @@ -981,6 +1125,53 @@ void ib_enum_roce_netdev(struct ib_device
> *ib_dev,
>  		}
>  }
> 
> +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
> +					  enum rdma_driver_id driver_id)
> +{
> +	struct ib_device *ib_dev;
> +
> +	down_read(&lists_rwsem);
> +	list_for_each_entry(ib_dev, &device_list, core_list) {
> +		unsigned int port;
> +
> +		if (driver_id != RDMA_DRIVER_UNKNOWN &&
> +		    ib_dev->driver_id != driver_id)
> +			continue;
> +
> +		if (!ib_dev->get_netdev)
> +			continue;
> +
> +		for (port = rdma_start_port(ib_dev);
> +		     port <= rdma_end_port(ib_dev); port++) {
> +			struct net_device *idev;
> +
> +			idev = ib_dev->get_netdev(ib_dev, port);
> +			if (idev != ndev) {
> +				if (idev)
> +					dev_put(idev);
> +				continue;
> +			}
> +
> +			/*
> +			 * Do not return a device if unregistration has
> +			 * started.
> +			 */
> +			if (!refcount_inc_not_zero(&ib_dev->refcount)) {
> +				dev_put(idev);
> +				continue;
> +			}
> +
> +			up_read(&lists_rwsem);
> +			dev_put(idev);
> +			return ib_dev;
> +		}
> +	}
> +	up_read(&lists_rwsem);
> +
> +	return NULL;
> +}
> +EXPORT_SYMBOL(ib_device_get_by_netdev);
> +
>  /**
>   * ib_enum_all_roce_netdevs - enumerate all RoCE devices
>   * @filter: Should we call the callback?
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 383e65c7bbc0..4069463f9a66 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -50,8 +50,10 @@ static void rxe_cleanup_ports(struct rxe_dev *rxe)
>  /* free resources for a rxe device all objects created for this device must
>   * have been destroyed
>   */
> -static void rxe_cleanup(struct rxe_dev *rxe)
> +void rxe_unregistered(struct ib_device *ib_dev)
>  {
> +	struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
> +
>  	rxe_pool_cleanup(&rxe->uc_pool);
>  	rxe_pool_cleanup(&rxe->pd_pool);
>  	rxe_pool_cleanup(&rxe->ah_pool);
> @@ -68,15 +70,6 @@ static void rxe_cleanup(struct rxe_dev *rxe)
>  	crypto_free_shash(rxe->tfm);
>  }
> 
> -/* called when all references have been dropped */ -void rxe_release(struct
> kref *kref) -{
> -	struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt);
> -
> -	rxe_cleanup(rxe);
> -	ib_dealloc_device(&rxe->ib_dev);
> -}
> -
>  /* initialize rxe device parameters */
>  static void rxe_init_device_param(struct rxe_dev *rxe)  { @@ -279,7 +272,6
> @@ static int rxe_init(struct rxe_dev *rxe)
>  	spin_lock_init(&rxe->mmap_offset_lock);
>  	spin_lock_init(&rxe->pending_lock);
>  	INIT_LIST_HEAD(&rxe->pending_mmaps);
> -	INIT_LIST_HEAD(&rxe->list);
> 
>  	mutex_init(&rxe->usdev_lock);
> 
> @@ -312,31 +304,13 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
> {
>  	int err;
> 
> -	kref_init(&rxe->ref_cnt);
> -
>  	err = rxe_init(rxe);
>  	if (err)
> -		goto err1;
> +		return err;
> 
>  	rxe_set_mtu(rxe, mtu);
> 
> -	err = rxe_register_device(rxe);
> -	if (err)
> -		goto err1;
> -
> -	return 0;
> -
> -err1:
> -	rxe_dev_put(rxe);
> -	return err;
> -}
> -
> -/* called by the ifc layer to remove a device */ -void rxe_remove(struct
> rxe_dev *rxe) -{
> -	rxe_unregister_device(rxe);
> -
> -	rxe_dev_put(rxe);
> +	return rxe_register_device(rxe);
>  }
> 
>  static int __init rxe_module_init(void) @@ -360,7 +334,7 @@ static int
> __init rxe_module_init(void)
> 
>  static void __exit rxe_module_exit(void)  {
> -	rxe_remove_all();
> +	ib_unregister_driver(RDMA_DRIVER_RXE);
>  	rxe_net_exit();
>  	rxe_cache_exit();
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
> index 8f79bd86d033..75e7113a756d 100644
> --- a/drivers/infiniband/sw/rxe/rxe.h
> +++ b/drivers/infiniband/sw/rxe/rxe.h
> @@ -96,17 +96,19 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe,  void
> rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
> 
>  int rxe_add(struct rxe_dev *rxe, unsigned int mtu); -void rxe_remove(struct
> rxe_dev *rxe); -void rxe_remove_all(void);
> 
>  void rxe_rcv(struct sk_buff *skb);
> 
> -static inline void rxe_dev_put(struct rxe_dev *rxe)
> +/* The caller must do a matching ib_device_put(&dev->ib_dev) */ static
> +inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev)
>  {
> -	kref_put(&rxe->ref_cnt, rxe_release);
> +	struct ib_device *ibdev =
> +		ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE);
> +
> +	if (!ibdev)
> +		return NULL;
> +	return container_of(ibdev, struct rxe_dev, ib_dev);
>  }
> -struct rxe_dev *net_to_rxe(struct net_device *ndev); -struct rxe_dev
> *get_rxe_by_name(const char *name);
> 
>  void rxe_port_up(struct rxe_dev *rxe);
>  void rxe_port_down(struct rxe_dev *rxe); diff --git
> a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index a675c9f2b427..e830d56a15c2 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -231,7 +231,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct
> rxe_srq *srq,
>  		      struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
>  		      struct rxe_modify_srq_cmd *ucmd);
> 
> -void rxe_release(struct kref *kref);
> +void rxe_unregistered(struct ib_device *ib_dev);
> 
>  int rxe_completer(void *arg);
>  int rxe_requester(void *arg);
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c
> b/drivers/infiniband/sw/rxe/rxe_net.c
> index b26a8141f3ed..c7637ad5e584 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.c
> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> @@ -45,43 +45,6 @@
>  #include "rxe_net.h"
>  #include "rxe_loc.h"
> 
> -static LIST_HEAD(rxe_dev_list);
> -static DEFINE_SPINLOCK(dev_list_lock); /* spinlock for device list */
> -
> -struct rxe_dev *net_to_rxe(struct net_device *ndev) -{
> -	struct rxe_dev *rxe;
> -	struct rxe_dev *found = NULL;
> -
> -	spin_lock_bh(&dev_list_lock);
> -	list_for_each_entry(rxe, &rxe_dev_list, list) {
> -		if (rxe->ndev == ndev) {
> -			found = rxe;
> -			break;
> -		}
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -
> -	return found;
> -}
> -
> -struct rxe_dev *get_rxe_by_name(const char *name) -{
> -	struct rxe_dev *rxe;
> -	struct rxe_dev *found = NULL;
> -
> -	spin_lock_bh(&dev_list_lock);
> -	list_for_each_entry(rxe, &rxe_dev_list, list) {
> -		if (!strcmp(name, dev_name(&rxe->ib_dev.dev))) {
> -			found = rxe;
> -			break;
> -		}
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -	return found;
> -}
> -
> -
>  static struct rxe_recv_sockets recv_sockets;
> 
>  struct device *rxe_dma_device(struct rxe_dev *rxe) @@ -229,18 +192,19
> @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>  	struct udphdr *udph;
>  	struct net_device *ndev = skb->dev;
>  	struct net_device *rdev = ndev;
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
> +	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
>  	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
> 
>  	if (!rxe && is_vlan_dev(rdev)) {
>  		rdev = vlan_dev_real_dev(ndev);
> -		rxe = net_to_rxe(rdev);
> +		rxe = rxe_get_dev_from_net(rdev);

Please, let's not lock-unlock a read semaphore on every incoming RoCE udp packet that too in NAPI's non-blocking context.

>  	}
>  	if (!rxe)
>  		goto drop;
> 
>  	if (skb_linearize(skb)) {
>  		pr_err("skb_linearize failed\n");
> +		ib_device_put(&rxe->ib_dev);
>  		goto drop;
>  	}
> 
> @@ -253,6 +217,12 @@ static int rxe_udp_encap_recv(struct sock *sk, struct
> sk_buff *skb)
> 
>  	rxe_rcv(skb);
> 
> +	/*
> +	 * FIXME: this is in the wrong place, it needs to be done when pkt is
> +	 * destroyed
> +	 */
> +	ib_device_put(&rxe->ib_dev);
> +
>  	return 0;
>  drop:
>  	kfree_skb(skb);
> @@ -558,36 +528,18 @@ struct rxe_dev *rxe_net_add(struct net_device
> *ndev)
>  	rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
>  	if (!rxe)
>  		return NULL;
> -
> +	rxe->ib_dev.driver_unregister = rxe_unregistered;
>  	rxe->ndev = ndev;
> 
>  	err = rxe_add(rxe, ndev->mtu);
>  	if (err) {
> -		ib_dealloc_device(&rxe->ib_dev);
Unbalanced ib_alloc_device and ib_dealloc_device doesn't sound like a good approach.
There should be a better way to do this.

> +		rxe_unregistered(&rxe->ib_dev);
>  		return NULL;
>  	}
> 
> -	spin_lock_bh(&dev_list_lock);
> -	list_add_tail(&rxe->list, &rxe_dev_list);
> -	spin_unlock_bh(&dev_list_lock);
>  	return rxe;
>  }
> 
> -void rxe_remove_all(void)
> -{
> -	spin_lock_bh(&dev_list_lock);
> -	while (!list_empty(&rxe_dev_list)) {
> -		struct rxe_dev *rxe =
> -			list_first_entry(&rxe_dev_list, struct rxe_dev, list);
> -
> -		list_del(&rxe->list);
> -		spin_unlock_bh(&dev_list_lock);
> -		rxe_remove(rxe);
> -		spin_lock_bh(&dev_list_lock);
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -}
> -
>  static void rxe_port_event(struct rxe_dev *rxe,
>  			   enum ib_event_type event)
>  {
> @@ -630,16 +582,16 @@ static int rxe_notify(struct notifier_block *not_blk,
>  		      void *arg)
>  {
>  	struct net_device *ndev = netdev_notifier_info_to_dev(arg);
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
> +	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
> 
>  	if (!rxe)
> -		goto out;
> +		return NOTIFY_OK;
> 
>  	switch (event) {
>  	case NETDEV_UNREGISTER:
> -		list_del(&rxe->list);
> -		rxe_remove(rxe);
> -		break;
> +		ib_unregister_device_and_put(&rxe->ib_dev);
> +		return NOTIFY_OK;
> +
>  	case NETDEV_UP:
>  		rxe_port_up(rxe);
>  		break;
> @@ -666,7 +618,8 @@ static int rxe_notify(struct notifier_block *not_blk,
>  			event, ndev->name);
>  		break;
>  	}
> -out:
> +
> +	ib_device_put(&rxe->ib_dev);
>  	return NOTIFY_OK;
>  }
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c
> b/drivers/infiniband/sw/rxe/rxe_pool.c
> index a04a076e03af..bb7e3268713e 100644
> --- a/drivers/infiniband/sw/rxe/rxe_pool.c
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> @@ -390,8 +390,6 @@ void *rxe_alloc(struct rxe_pool *pool)
>  	kref_get(&pool->ref_cnt);
>  	read_unlock_irqrestore(&pool->pool_lock, flags);
> 
> -	kref_get(&pool->rxe->ref_cnt);
> -
>  	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
>  		goto out_put_pool;
> 
> @@ -408,7 +406,6 @@ void *rxe_alloc(struct rxe_pool *pool)
> 
>  out_put_pool:
>  	atomic_dec(&pool->num_elem);
> -	rxe_dev_put(pool->rxe);
>  	rxe_pool_put(pool);
>  	return NULL;
>  }
> @@ -424,7 +421,6 @@ void rxe_elem_release(struct kref *kref)
> 
>  	kmem_cache_free(pool_cache(pool), elem);
>  	atomic_dec(&pool->num_elem);
> -	rxe_dev_put(pool->rxe);
>  	rxe_pool_put(pool);
>  }
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c
> b/drivers/infiniband/sw/rxe/rxe_sysfs.c
> index 73a19f808e1b..87c17675cf9d 100644
> --- a/drivers/infiniband/sw/rxe/rxe_sysfs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
> @@ -53,20 +53,14 @@ static int sanitize_arg(const char *val, char *intf, int
> intf_len)
>  	return len;
>  }
> 
> -static void rxe_set_port_state(struct net_device *ndev)
> +static void rxe_set_port_state(struct rxe_dev *rxe, struct net_device
> +*ndev)
>  {
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
>  	bool is_up = netif_running(ndev) && netif_carrier_ok(ndev);
> 
> -	if (!rxe)
> -		goto out;
> -
>  	if (is_up)
>  		rxe_port_up(rxe);
>  	else
>  		rxe_port_down(rxe); /* down for unknown state */
> -out:
> -	return;
>  }
> 
>  static int rxe_param_set_add(const char *val, const struct kernel_param
> *kp) @@ -74,24 +68,25 @@ static int rxe_param_set_add(const char *val,
> const struct kernel_param *kp)
>  	int len;
>  	int err = 0;
>  	char intf[32];
> -	struct net_device *ndev = NULL;
> +	struct net_device *ndev;
> +	struct rxe_dev *exists;
>  	struct rxe_dev *rxe;
> 
>  	len = sanitize_arg(val, intf, sizeof(intf));
>  	if (!len) {
>  		pr_err("add: invalid interface name\n");
> -		err = -EINVAL;
> -		goto err;
> +		return -EINVAL;
>  	}
> 
>  	ndev = dev_get_by_name(&init_net, intf);
>  	if (!ndev) {
>  		pr_err("interface %s not found\n", intf);
> -		err = -EINVAL;
> -		goto err;
> +		return -EINVAL;
>  	}
> 
> -	if (net_to_rxe(ndev)) {
> +	exists = rxe_get_dev_from_net(ndev);
> +	if (exists) {
> +		ib_device_put(&exists->ib_dev);
>  		pr_err("already configured on %s\n", intf);
>  		err = -EINVAL;
>  		goto err;
> @@ -104,11 +99,11 @@ static int rxe_param_set_add(const char *val, const
> struct kernel_param *kp)
>  		goto err;
>  	}
> 
> -	rxe_set_port_state(ndev);
> +	rxe_set_port_state(rxe, ndev);
>  	dev_info(&rxe->ib_dev.dev, "added %s\n", intf);
> +
>  err:
> -	if (ndev)
> -		dev_put(ndev);
> +	dev_put(ndev);
>  	return err;
>  }
> 
> @@ -116,7 +111,7 @@ static int rxe_param_set_remove(const char *val,
> const struct kernel_param *kp)  {
>  	int len;
>  	char intf[32];
> -	struct rxe_dev *rxe;
> +	struct ib_device *ib_dev;
> 
>  	len = sanitize_arg(val, intf, sizeof(intf));
>  	if (!len) {
> @@ -126,20 +121,18 @@ static int rxe_param_set_remove(const char *val,
> const struct kernel_param *kp)
> 
>  	if (strncmp("all", intf, len) == 0) {
>  		pr_info("rxe_sys: remove all");
> -		rxe_remove_all();
> +		ib_unregister_driver(RDMA_DRIVER_RXE);
>  		return 0;
>  	}
> 
> -	rxe = get_rxe_by_name(intf);
> +	ib_dev = ib_device_get_by_name(intf, RDMA_DRIVER_RXE);
> 
> -	if (!rxe) {
> +	if (!ib_dev) {
>  		pr_err("not configured on %s\n", intf);
>  		return -EINVAL;
>  	}
> 
> -	list_del(&rxe->list);
> -	rxe_remove(rxe);
> -
> +	ib_unregister_device_and_put(ib_dev);
>  	return 0;
>  }
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c
> b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index 30817c79ba96..28beee8f5838 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -1286,10 +1286,3 @@ int rxe_register_device(struct rxe_dev *rxe)
> 
>  	return err;
>  }
> -
> -void rxe_unregister_device(struct rxe_dev *rxe) -{
> -	struct ib_device *dev = &rxe->ib_dev;
> -
> -	ib_unregister_device(dev);
> -}
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h
> b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index 831381b7788d..a0b635812784 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -385,7 +385,6 @@ struct rxe_dev {
>  	struct ib_device_attr	attr;
>  	int			max_ucontext;
>  	int			max_inline_data;
> -	struct kref		ref_cnt;
>  	struct mutex	usdev_lock;
> 
>  	struct net_device	*ndev;
> @@ -412,7 +411,6 @@ struct rxe_dev {
>  	u64			stats_counters[RXE_NUM_OF_COUNTERS];
> 
>  	struct rxe_port		port;
> -	struct list_head	list;
>  	struct crypto_shash	*tfm;
>  };
> 
> @@ -467,7 +465,6 @@ static inline struct rxe_mem *to_rmw(struct ib_mw
> *mw)  }
> 
>  int rxe_register_device(struct rxe_dev *rxe); -void
> rxe_unregister_device(struct rxe_dev *rxe);
> 
>  void rxe_mc_cleanup(struct rxe_pool_entry *arg);
> 
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
> 85021451eee0..5fbeedacb3aa 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -2534,6 +2534,9 @@ struct ib_device {
>  				 struct ib_counters_read_attr
> *counters_read_attr,
>  				 struct uverbs_attr_bundle *attrs);
> 
> +	/* Called after all clients have been destroyed*/
> +	void (*driver_unregister)(struct ib_device *dev);
> +
>  	/**
>  	 * rdma netdev operation
>  	 *
> @@ -2653,6 +2656,8 @@ int ib_register_device(struct ib_device *device,
> const char *name,
>  		       int (*port_callback)(struct ib_device *, u8,
>  					    struct kobject *));
>  void ib_unregister_device(struct ib_device *device);
> +void ib_unregister_driver(enum rdma_driver_id driver_id); void
> +ib_unregister_device_and_put(struct ib_device *device);
> 
>  int ib_register_client   (struct ib_client *client);
>  void ib_unregister_client(struct ib_client *client); @@ -3937,6 +3942,11 @@
> static inline bool ib_access_writable(int access_flags)  int
> ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
>  		       struct ib_mr_status *mr_status);
> 
> +void ib_device_put(struct ib_device *device); struct ib_device
> +*ib_device_get_by_netdev(struct net_device *ndev,
> +					  enum rdma_driver_id driver_id);
> +struct ib_device *ib_device_get_by_name(const char *name,
> +					enum rdma_driver_id driver_id);
>  struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8
> port,
>  					    u16 pkey, const union ib_gid *gid,
>  					    const struct sockaddr *addr);
> --
> 1.8.3.1

I think driver unload and device unregistration related complexity should be left in the rxe driver.
I do not have any better locking suggestion at this point, haven't followed the email discussion.
But core is surely getting undesired locking here along with unbalanced deallocs that deserves a better method.
Zhu Yanjun Dec. 15, 2018, 5:16 a.m. UTC | #2
On 2018/12/15 上午12:05, Jason Gunthorpe wrote:
> Replace
>   - The nonsense rxe_dev.kref with rxe_dev.ib_dev.dev.kref
>   - rxe_dev_list with core's device_list.
>   - Buggy net_to_rxe with ib_device_get_by_netdev that does proper
>     lifetime management
>   - Buggy get_rxe_by_name with ib_device_get_by_name
>
> Add
>   - ib_unregister_driver so drivers like rxe don't have to keep track
>     of things to do module unload
>   - ib_unregister_device_and_put to work with potiners returned by
>     that various ib_device_get_* functions
>
> The refcount change to the pool might need more work, but fundamentally
> the driver cannot have krefs open in the pool across unregistration or it
> contains module unlolad races.
>
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> ---
>   drivers/infiniband/core/core_priv.h   |   1 -
>   drivers/infiniband/core/device.c      | 213 ++++++++++++++++++++++++++++++++--
>   drivers/infiniband/sw/rxe/rxe.c       |  38 +-----
>   drivers/infiniband/sw/rxe/rxe.h       |  14 ++-
>   drivers/infiniband/sw/rxe/rxe_loc.h   |   2 +-
>   drivers/infiniband/sw/rxe/rxe_net.c   |  83 +++----------
>   drivers/infiniband/sw/rxe/rxe_pool.c  |   4 -
>   drivers/infiniband/sw/rxe/rxe_sysfs.c |  39 +++----
>   drivers/infiniband/sw/rxe/rxe_verbs.c |   7 --
>   drivers/infiniband/sw/rxe/rxe_verbs.h |   3 -
>   include/rdma/ib_verbs.h               |  10 ++
>   11 files changed, 261 insertions(+), 153 deletions(-)
>
> diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
> index cc7535c5e192..1b2575430032 100644
> --- a/drivers/infiniband/core/core_priv.h
> +++ b/drivers/infiniband/core/core_priv.h
> @@ -267,7 +267,6 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
>   #endif
>   
>   struct ib_device *ib_device_get_by_index(u32 ifindex);
> -void ib_device_put(struct ib_device *device);
>   /* RDMA device netlink */
>   void nldev_init(void);
>   void nldev_exit(void);
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index 348a7fb1f945..a2523605e02a 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -85,6 +85,9 @@ struct ib_client_data {
>   static DEFINE_MUTEX(device_mutex);
>   static DECLARE_RWSEM(lists_rwsem);
>   
> +/* Used to serialize ib_unregister_driver */
> +static DECLARE_RWSEM(unregister_rwsem);
> +
>   static int ib_security_change(struct notifier_block *nb, unsigned long event,
>   			      void *lsm_data);
>   static void ib_policy_change_task(struct work_struct *work);
> @@ -168,6 +171,7 @@ void ib_device_put(struct ib_device *device)

Where is this function ib_device_put? This function is used many times.

Zhu Yanjun

>   	if (refcount_dec_and_test(&device->refcount))
>   		complete(&device->unreg_completion);
>   }
> +EXPORT_SYMBOL(ib_device_put);
>   
>   static struct ib_device *__ib_device_get_by_name(const char *name)
>   {
> @@ -180,6 +184,27 @@ static struct ib_device *__ib_device_get_by_name(const char *name)
>   	return NULL;
>   }
>   
> +struct ib_device *ib_device_get_by_name(const char *name,
> +					enum rdma_driver_id driver_id)
> +{
> +	struct ib_device *device;
> +
> +	down_read(&lists_rwsem);
> +	device = __ib_device_get_by_name(name);
> +	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
> +	    device->driver_id != driver_id)
> +		device = NULL;
> +
> +	if (device) {
> +		/* Do not return a device if unregistration has started. */
> +		if (!refcount_inc_not_zero(&device->refcount))
> +			device = NULL;
> +	}
> +	up_read(&lists_rwsem);
> +	return device;
> +}
> +EXPORT_SYMBOL(ib_device_get_by_name);
> +
>   int ib_device_rename(struct ib_device *ibdev, const char *name)
>   {
>   	struct ib_device *device;
> @@ -641,28 +666,41 @@ int ib_register_device(struct ib_device *device, const char *name,
>   }
>   EXPORT_SYMBOL(ib_register_device);
>   
> -/**
> - * ib_unregister_device - Unregister an IB device
> - * @device:Device to unregister
> +/* Returns false if the device is already undergoing unregistration in another
> + * thread, and the device pointer may be invalid upon return.
>    *
> - * Unregister an IB device.  All clients will receive a remove callback.
> + * If true is returned then the caller owns a kref associated with the device
> + * (taken from the kref owned by the list).
> + *
> + * Upon return the device cannot be returned by any 'get' functions.
>    */
> -void ib_unregister_device(struct ib_device *device)
> +static bool ib_pre_unregister(struct ib_device *device)
> +{
> +	bool already_removed;
> +
> +	mutex_lock(&device_mutex);
> +	down_write(&lists_rwsem);
> +	already_removed = list_empty(&device->core_list);
> +	list_del_init(&device->core_list);
> +	up_write(&lists_rwsem);
> +	mutex_unlock(&device_mutex);
> +
> +	/* Matches the recound_set in ib_alloc_device */
> +	ib_device_put(device);
> +
> +	return !already_removed;
> +}
> +
> +static void __ib_unregister_device(struct ib_device *device)
>   {
>   	struct ib_client_data *context, *tmp;
>   	unsigned long flags;
>   
> -	/*
> -	 * Wait for all netlink command callers to finish working on the
> -	 * device.
> -	 */
> -	ib_device_put(device);
>   	wait_for_completion(&device->unreg_completion);
>   
>   	mutex_lock(&device_mutex);
>   
>   	down_write(&lists_rwsem);
> -	list_del(&device->core_list);
>   	write_lock_irq(&device->client_data_lock);
>   	list_for_each_entry(context, &device->client_data_list, list)
>   		context->going_down = true;
> @@ -696,10 +734,116 @@ void ib_unregister_device(struct ib_device *device)
>   	up_write(&lists_rwsem);
>   
>   	device->reg_state = IB_DEV_UNREGISTERED;
> +
> +	/*
> +	 * Drivers using the new flow may not call ib_dealloc_device except
> +	 * in error unwind prior to registration success.
> +	 */
> +	if (device->driver_unregister) {
> +		device->driver_unregister(device);
> +		ib_dealloc_device(device);
> +	}
> +}
> +
> +/**
> + * ib_unregister_device - Unregister an IB device
> + * @device: The device to unregister
> + *
> + * Unregister an IB device.  All clients will receive a remove callback.
> + *
> + * Callers can call this routine only once, and must protect against
> + * races. Typically it should only be called as part of a remove callback in
> + * an implementation of driver core's struct device_driver and related.
> + *
> + * A driver must choose to use either only ib_unregister_device or
> + * ib_unregister_device_and_put & ib_unregister_driver.
> + */
> +void ib_unregister_device(struct ib_device *device)
> +{
> +	down_read(&unregister_rwsem);
> +	if (ib_pre_unregister(device))
> +		__ib_unregister_device(device);
> +	else
> +		WARN(true, "Driver mixing %s with _and_put", __func__);
> +	up_read(&unregister_rwsem);
>   }
>   EXPORT_SYMBOL(ib_unregister_device);
>   
>   /**
> + * ib_unregister_device_and_put - Unregister a device while holding a 'get'
> + * device: The device to unregister
> + *
> + * This is the same as ib_unregister_device(), except it includes an internal
> + * ib_device_put() that should match a 'get' obtained by the caller.
> + *
> + * It is safe to call this routine concurrently from multiple threads while
> + * holding the 'get', however in this case return from the function does not
> + * guarantee the device is destroyed, only that destruction is in-progress.
> + *
> + * Drivers using this flow MUST use the driver_unregister callback to clean up
> + * their resources associated with the device and dealloc it.
> + */
> +void ib_unregister_device_and_put(struct ib_device *device)
> +{
> +	down_read(&unregister_rwsem);
> +	if (ib_pre_unregister(device)) {
> +		/*
> +		 * Caller's get can be returned now that we hold the kref,
> +		 * otherwise ib_pre_unregister returned the caller's get
> +		 */
> +		ib_device_put(device);
> +		__ib_unregister_device(device);
> +	}
> +	up_read(&unregister_rwsem);
> +}
> +EXPORT_SYMBOL(ib_unregister_device_and_put);
> +
> +/**
> + * ib_unregister_driver - Unregister all IB devices for a driver
> + * @driver_id: The driver to unregister
> + *
> + * This implements a fence for device unregistration. It only returns once all
> + * devices associated with the driver_id have fully completed their
> + * unregistration and returned from ib_unregister_device*().
> + *
> + * If device's are not yet unregistered it goes ahead and starts unregistering
> + * them.
> + *
> + * The caller must ensure that no devices can be be added while this is
> + * running (or at all for the module_unload case)
> + */
> +void ib_unregister_driver(enum rdma_driver_id driver_id)
> +{
> +	struct ib_device *device, *tmp;
> +	LIST_HEAD(driver_devs);
> +
> +	down_write(&unregister_rwsem);
> +	mutex_lock(&device_mutex);
> +	down_write(&lists_rwsem);
> +	list_for_each_entry_safe(device, tmp, &device_list, core_list) {
> +		if (device->driver_id == driver_id)
> +			list_move(&device->core_list, &driver_devs);
> +	}
> +	up_write(&lists_rwsem);
> +
> +	while ((device = list_first_entry_or_null(
> +			&driver_devs, struct ib_device, core_list))) {
> +		mutex_unlock(&device_mutex);
> +		/*
> +		 * We are holding the unregister_rwsem, so it is impossible
> +		 * for another thread to be doing registration.
> +		 */
> +		if (!ib_pre_unregister(device))
> +			WARN(true, "Impossible failure of ib_pre_unregister");
> +		__ib_unregister_device(device);
> +		mutex_lock(&device_mutex);
> +	}
> +	mutex_unlock(&device_mutex);
> +	up_write(&unregister_rwsem);
> +}
> +EXPORT_SYMBOL(ib_unregister_driver);
> +
> +/**
>    * ib_register_client - Register an IB client
>    * @client:Client to register
>    *
> @@ -981,6 +1125,53 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
>   		}
>   }
>   
> +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
> +					  enum rdma_driver_id driver_id)
> +{
> +	struct ib_device *ib_dev;
> +
> +	down_read(&lists_rwsem);
> +	list_for_each_entry(ib_dev, &device_list, core_list) {
> +		unsigned int port;
> +
> +		if (driver_id != RDMA_DRIVER_UNKNOWN &&
> +		    ib_dev->driver_id != driver_id)
> +			continue;
> +
> +		if (!ib_dev->get_netdev)
> +			continue;
> +
> +		for (port = rdma_start_port(ib_dev);
> +		     port <= rdma_end_port(ib_dev); port++) {
> +			struct net_device *idev;
> +
> +			idev = ib_dev->get_netdev(ib_dev, port);
> +			if (idev != ndev) {
> +				if (idev)
> +					dev_put(idev);
> +				continue;
> +			}
> +
> +			/*
> +			 * Do not return a device if unregistration has
> +			 * started.
> +			 */
> +			if (!refcount_inc_not_zero(&ib_dev->refcount)) {
> +				dev_put(idev);
> +				continue;
> +			}
> +
> +			up_read(&lists_rwsem);
> +			dev_put(idev);
> +			return ib_dev;
> +		}
> +	}
> +	up_read(&lists_rwsem);
> +
> +	return NULL;
> +}
> +EXPORT_SYMBOL(ib_device_get_by_netdev);
> +
>   /**
>    * ib_enum_all_roce_netdevs - enumerate all RoCE devices
>    * @filter: Should we call the callback?
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 383e65c7bbc0..4069463f9a66 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -50,8 +50,10 @@ static void rxe_cleanup_ports(struct rxe_dev *rxe)
>   /* free resources for a rxe device all objects created for this device must
>    * have been destroyed
>    */
> -static void rxe_cleanup(struct rxe_dev *rxe)
> +void rxe_unregistered(struct ib_device *ib_dev)
>   {
> +	struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
> +
>   	rxe_pool_cleanup(&rxe->uc_pool);
>   	rxe_pool_cleanup(&rxe->pd_pool);
>   	rxe_pool_cleanup(&rxe->ah_pool);
> @@ -68,15 +70,6 @@ static void rxe_cleanup(struct rxe_dev *rxe)
>   	crypto_free_shash(rxe->tfm);
>   }
>   
> -/* called when all references have been dropped */
> -void rxe_release(struct kref *kref)
> -{
> -	struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt);
> -
> -	rxe_cleanup(rxe);
> -	ib_dealloc_device(&rxe->ib_dev);
> -}
> -
>   /* initialize rxe device parameters */
>   static void rxe_init_device_param(struct rxe_dev *rxe)
>   {
> @@ -279,7 +272,6 @@ static int rxe_init(struct rxe_dev *rxe)
>   	spin_lock_init(&rxe->mmap_offset_lock);
>   	spin_lock_init(&rxe->pending_lock);
>   	INIT_LIST_HEAD(&rxe->pending_mmaps);
> -	INIT_LIST_HEAD(&rxe->list);
>   
>   	mutex_init(&rxe->usdev_lock);
>   
> @@ -312,31 +304,13 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
>   {
>   	int err;
>   
> -	kref_init(&rxe->ref_cnt);
> -
>   	err = rxe_init(rxe);
>   	if (err)
> -		goto err1;
> +		return err;
>   
>   	rxe_set_mtu(rxe, mtu);
>   
> -	err = rxe_register_device(rxe);
> -	if (err)
> -		goto err1;
> -
> -	return 0;
> -
> -err1:
> -	rxe_dev_put(rxe);
> -	return err;
> -}
> -
> -/* called by the ifc layer to remove a device */
> -void rxe_remove(struct rxe_dev *rxe)
> -{
> -	rxe_unregister_device(rxe);
> -
> -	rxe_dev_put(rxe);
> +	return rxe_register_device(rxe);
>   }
>   
>   static int __init rxe_module_init(void)
> @@ -360,7 +334,7 @@ static int __init rxe_module_init(void)
>   
>   static void __exit rxe_module_exit(void)
>   {
> -	rxe_remove_all();
> +	ib_unregister_driver(RDMA_DRIVER_RXE);
>   	rxe_net_exit();
>   	rxe_cache_exit();
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
> index 8f79bd86d033..75e7113a756d 100644
> --- a/drivers/infiniband/sw/rxe/rxe.h
> +++ b/drivers/infiniband/sw/rxe/rxe.h
> @@ -96,17 +96,19 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe,
>   void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
>   
>   int rxe_add(struct rxe_dev *rxe, unsigned int mtu);
> -void rxe_remove(struct rxe_dev *rxe);
> -void rxe_remove_all(void);
>   
>   void rxe_rcv(struct sk_buff *skb);
>   
> -static inline void rxe_dev_put(struct rxe_dev *rxe)
> +/* The caller must do a matching ib_device_put(&dev->ib_dev) */
> +static inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev)
>   {
> -	kref_put(&rxe->ref_cnt, rxe_release);
> +	struct ib_device *ibdev =
> +		ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE);
> +
> +	if (!ibdev)
> +		return NULL;
> +	return container_of(ibdev, struct rxe_dev, ib_dev);
>   }
> -struct rxe_dev *net_to_rxe(struct net_device *ndev);
> -struct rxe_dev *get_rxe_by_name(const char *name);
>   
>   void rxe_port_up(struct rxe_dev *rxe);
>   void rxe_port_down(struct rxe_dev *rxe);
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index a675c9f2b427..e830d56a15c2 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -231,7 +231,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
>   		      struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
>   		      struct rxe_modify_srq_cmd *ucmd);
>   
> -void rxe_release(struct kref *kref);
> +void rxe_unregistered(struct ib_device *ib_dev);
>   
>   int rxe_completer(void *arg);
>   int rxe_requester(void *arg);
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
> index b26a8141f3ed..c7637ad5e584 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.c
> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> @@ -45,43 +45,6 @@
>   #include "rxe_net.h"
>   #include "rxe_loc.h"
>   
> -static LIST_HEAD(rxe_dev_list);
> -static DEFINE_SPINLOCK(dev_list_lock); /* spinlock for device list */
> -
> -struct rxe_dev *net_to_rxe(struct net_device *ndev)
> -{
> -	struct rxe_dev *rxe;
> -	struct rxe_dev *found = NULL;
> -
> -	spin_lock_bh(&dev_list_lock);
> -	list_for_each_entry(rxe, &rxe_dev_list, list) {
> -		if (rxe->ndev == ndev) {
> -			found = rxe;
> -			break;
> -		}
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -
> -	return found;
> -}
> -
> -struct rxe_dev *get_rxe_by_name(const char *name)
> -{
> -	struct rxe_dev *rxe;
> -	struct rxe_dev *found = NULL;
> -
> -	spin_lock_bh(&dev_list_lock);
> -	list_for_each_entry(rxe, &rxe_dev_list, list) {
> -		if (!strcmp(name, dev_name(&rxe->ib_dev.dev))) {
> -			found = rxe;
> -			break;
> -		}
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -	return found;
> -}
> -
> -
>   static struct rxe_recv_sockets recv_sockets;
>   
>   struct device *rxe_dma_device(struct rxe_dev *rxe)
> @@ -229,18 +192,19 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>   	struct udphdr *udph;
>   	struct net_device *ndev = skb->dev;
>   	struct net_device *rdev = ndev;
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
> +	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
>   	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
>   
>   	if (!rxe && is_vlan_dev(rdev)) {
>   		rdev = vlan_dev_real_dev(ndev);
> -		rxe = net_to_rxe(rdev);
> +		rxe = rxe_get_dev_from_net(rdev);
>   	}
>   	if (!rxe)
>   		goto drop;
>   
>   	if (skb_linearize(skb)) {
>   		pr_err("skb_linearize failed\n");
> +		ib_device_put(&rxe->ib_dev);
>   		goto drop;
>   	}
>   
> @@ -253,6 +217,12 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>   
>   	rxe_rcv(skb);
>   
> +	/*
> +	 * FIXME: this is in the wrong place, it needs to be done when pkt is
> +	 * destroyed
> +	 */
> +	ib_device_put(&rxe->ib_dev);
> +
>   	return 0;
>   drop:
>   	kfree_skb(skb);
> @@ -558,36 +528,18 @@ struct rxe_dev *rxe_net_add(struct net_device *ndev)
>   	rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
>   	if (!rxe)
>   		return NULL;
> -
> +	rxe->ib_dev.driver_unregister = rxe_unregistered;
>   	rxe->ndev = ndev;
>   
>   	err = rxe_add(rxe, ndev->mtu);
>   	if (err) {
> -		ib_dealloc_device(&rxe->ib_dev);
> +		rxe_unregistered(&rxe->ib_dev);
>   		return NULL;
>   	}
>   
> -	spin_lock_bh(&dev_list_lock);
> -	list_add_tail(&rxe->list, &rxe_dev_list);
> -	spin_unlock_bh(&dev_list_lock);
>   	return rxe;
>   }
>   
> -void rxe_remove_all(void)
> -{
> -	spin_lock_bh(&dev_list_lock);
> -	while (!list_empty(&rxe_dev_list)) {
> -		struct rxe_dev *rxe =
> -			list_first_entry(&rxe_dev_list, struct rxe_dev, list);
> -
> -		list_del(&rxe->list);
> -		spin_unlock_bh(&dev_list_lock);
> -		rxe_remove(rxe);
> -		spin_lock_bh(&dev_list_lock);
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -}
> -
>   static void rxe_port_event(struct rxe_dev *rxe,
>   			   enum ib_event_type event)
>   {
> @@ -630,16 +582,16 @@ static int rxe_notify(struct notifier_block *not_blk,
>   		      void *arg)
>   {
>   	struct net_device *ndev = netdev_notifier_info_to_dev(arg);
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
> +	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
>   
>   	if (!rxe)
> -		goto out;
> +		return NOTIFY_OK;
>   
>   	switch (event) {
>   	case NETDEV_UNREGISTER:
> -		list_del(&rxe->list);
> -		rxe_remove(rxe);
> -		break;
> +		ib_unregister_device_and_put(&rxe->ib_dev);
> +		return NOTIFY_OK;
> +
>   	case NETDEV_UP:
>   		rxe_port_up(rxe);
>   		break;
> @@ -666,7 +618,8 @@ static int rxe_notify(struct notifier_block *not_blk,
>   			event, ndev->name);
>   		break;
>   	}
> -out:
> +
> +	ib_device_put(&rxe->ib_dev);
>   	return NOTIFY_OK;
>   }
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> index a04a076e03af..bb7e3268713e 100644
> --- a/drivers/infiniband/sw/rxe/rxe_pool.c
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> @@ -390,8 +390,6 @@ void *rxe_alloc(struct rxe_pool *pool)
>   	kref_get(&pool->ref_cnt);
>   	read_unlock_irqrestore(&pool->pool_lock, flags);
>   
> -	kref_get(&pool->rxe->ref_cnt);
> -
>   	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
>   		goto out_put_pool;
>   
> @@ -408,7 +406,6 @@ void *rxe_alloc(struct rxe_pool *pool)
>   
>   out_put_pool:
>   	atomic_dec(&pool->num_elem);
> -	rxe_dev_put(pool->rxe);
>   	rxe_pool_put(pool);
>   	return NULL;
>   }
> @@ -424,7 +421,6 @@ void rxe_elem_release(struct kref *kref)
>   
>   	kmem_cache_free(pool_cache(pool), elem);
>   	atomic_dec(&pool->num_elem);
> -	rxe_dev_put(pool->rxe);
>   	rxe_pool_put(pool);
>   }
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c
> index 73a19f808e1b..87c17675cf9d 100644
> --- a/drivers/infiniband/sw/rxe/rxe_sysfs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
> @@ -53,20 +53,14 @@ static int sanitize_arg(const char *val, char *intf, int intf_len)
>   	return len;
>   }
>   
> -static void rxe_set_port_state(struct net_device *ndev)
> +static void rxe_set_port_state(struct rxe_dev *rxe, struct net_device *ndev)
>   {
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
>   	bool is_up = netif_running(ndev) && netif_carrier_ok(ndev);
>   
> -	if (!rxe)
> -		goto out;
> -
>   	if (is_up)
>   		rxe_port_up(rxe);
>   	else
>   		rxe_port_down(rxe); /* down for unknown state */
> -out:
> -	return;
>   }
>   
>   static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
> @@ -74,24 +68,25 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
>   	int len;
>   	int err = 0;
>   	char intf[32];
> -	struct net_device *ndev = NULL;
> +	struct net_device *ndev;
> +	struct rxe_dev *exists;
>   	struct rxe_dev *rxe;
>   
>   	len = sanitize_arg(val, intf, sizeof(intf));
>   	if (!len) {
>   		pr_err("add: invalid interface name\n");
> -		err = -EINVAL;
> -		goto err;
> +		return -EINVAL;
>   	}
>   
>   	ndev = dev_get_by_name(&init_net, intf);
>   	if (!ndev) {
>   		pr_err("interface %s not found\n", intf);
> -		err = -EINVAL;
> -		goto err;
> +		return -EINVAL;
>   	}
>   
> -	if (net_to_rxe(ndev)) {
> +	exists = rxe_get_dev_from_net(ndev);
> +	if (exists) {
> +		ib_device_put(&exists->ib_dev);
>   		pr_err("already configured on %s\n", intf);
>   		err = -EINVAL;
>   		goto err;
> @@ -104,11 +99,11 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
>   		goto err;
>   	}
>   
> -	rxe_set_port_state(ndev);
> +	rxe_set_port_state(rxe, ndev);
>   	dev_info(&rxe->ib_dev.dev, "added %s\n", intf);
> +
>   err:
> -	if (ndev)
> -		dev_put(ndev);
> +	dev_put(ndev);
>   	return err;
>   }
>   
> @@ -116,7 +111,7 @@ static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
>   {
>   	int len;
>   	char intf[32];
> -	struct rxe_dev *rxe;
> +	struct ib_device *ib_dev;
>   
>   	len = sanitize_arg(val, intf, sizeof(intf));
>   	if (!len) {
> @@ -126,20 +121,18 @@ static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
>   
>   	if (strncmp("all", intf, len) == 0) {
>   		pr_info("rxe_sys: remove all");
> -		rxe_remove_all();
> +		ib_unregister_driver(RDMA_DRIVER_RXE);
>   		return 0;
>   	}
>   
> -	rxe = get_rxe_by_name(intf);
> +	ib_dev = ib_device_get_by_name(intf, RDMA_DRIVER_RXE);
>   
> -	if (!rxe) {
> +	if (!ib_dev) {
>   		pr_err("not configured on %s\n", intf);
>   		return -EINVAL;
>   	}
>   
> -	list_del(&rxe->list);
> -	rxe_remove(rxe);
> -
> +	ib_unregister_device_and_put(ib_dev);
>   	return 0;
>   }
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index 30817c79ba96..28beee8f5838 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -1286,10 +1286,3 @@ int rxe_register_device(struct rxe_dev *rxe)
>   
>   	return err;
>   }
> -
> -void rxe_unregister_device(struct rxe_dev *rxe)
> -{
> -	struct ib_device *dev = &rxe->ib_dev;
> -
> -	ib_unregister_device(dev);
> -}
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index 831381b7788d..a0b635812784 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -385,7 +385,6 @@ struct rxe_dev {
>   	struct ib_device_attr	attr;
>   	int			max_ucontext;
>   	int			max_inline_data;
> -	struct kref		ref_cnt;
>   	struct mutex	usdev_lock;
>   
>   	struct net_device	*ndev;
> @@ -412,7 +411,6 @@ struct rxe_dev {
>   	u64			stats_counters[RXE_NUM_OF_COUNTERS];
>   
>   	struct rxe_port		port;
> -	struct list_head	list;
>   	struct crypto_shash	*tfm;
>   };
>   
> @@ -467,7 +465,6 @@ static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
>   }
>   
>   int rxe_register_device(struct rxe_dev *rxe);
> -void rxe_unregister_device(struct rxe_dev *rxe);
>   
>   void rxe_mc_cleanup(struct rxe_pool_entry *arg);
>   
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 85021451eee0..5fbeedacb3aa 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -2534,6 +2534,9 @@ struct ib_device {
>   				 struct ib_counters_read_attr *counters_read_attr,
>   				 struct uverbs_attr_bundle *attrs);
>   
> +	/* Called after all clients have been destroyed*/
> +	void (*driver_unregister)(struct ib_device *dev);
> +
>   	/**
>   	 * rdma netdev operation
>   	 *
> @@ -2653,6 +2656,8 @@ int ib_register_device(struct ib_device *device, const char *name,
>   		       int (*port_callback)(struct ib_device *, u8,
>   					    struct kobject *));
>   void ib_unregister_device(struct ib_device *device);
> +void ib_unregister_driver(enum rdma_driver_id driver_id);
> +void ib_unregister_device_and_put(struct ib_device *device);
>   
>   int ib_register_client   (struct ib_client *client);
>   void ib_unregister_client(struct ib_client *client);
> @@ -3937,6 +3942,11 @@ static inline bool ib_access_writable(int access_flags)
>   int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
>   		       struct ib_mr_status *mr_status);
>   
> +void ib_device_put(struct ib_device *device);
> +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
> +					  enum rdma_driver_id driver_id);
> +struct ib_device *ib_device_get_by_name(const char *name,
> +					enum rdma_driver_id driver_id);
>   struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
>   					    u16 pkey, const union ib_gid *gid,
>   					    const struct sockaddr *addr);
Zhu Yanjun Dec. 15, 2018, 5:33 a.m. UTC | #3
On 2018/12/15 上午12:05, Jason Gunthorpe wrote:
> Replace
>   - The nonsense rxe_dev.kref with rxe_dev.ib_dev.dev.kref
>   - rxe_dev_list with core's device_list.
>   - Buggy net_to_rxe with ib_device_get_by_netdev that does proper
>     lifetime management
>   - Buggy get_rxe_by_name with ib_device_get_by_name
>
> Add
>   - ib_unregister_driver so drivers like rxe don't have to keep track
>     of things to do module unload
>   - ib_unregister_device_and_put to work with potiners returned by
>     that various ib_device_get_* functions
>
> The refcount change to the pool might need more work, but fundamentally
> the driver cannot have krefs open in the pool across unregistration or it
> contains module unlolad races.
>
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> ---
>   drivers/infiniband/core/core_priv.h   |   1 -
>   drivers/infiniband/core/device.c      | 213 ++++++++++++++++++++++++++++++++--
>   drivers/infiniband/sw/rxe/rxe.c       |  38 +-----
>   drivers/infiniband/sw/rxe/rxe.h       |  14 ++-
>   drivers/infiniband/sw/rxe/rxe_loc.h   |   2 +-
>   drivers/infiniband/sw/rxe/rxe_net.c   |  83 +++----------
>   drivers/infiniband/sw/rxe/rxe_pool.c  |   4 -
>   drivers/infiniband/sw/rxe/rxe_sysfs.c |  39 +++----
>   drivers/infiniband/sw/rxe/rxe_verbs.c |   7 --
>   drivers/infiniband/sw/rxe/rxe_verbs.h |   3 -
>   include/rdma/ib_verbs.h               |  10 ++
>   11 files changed, 261 insertions(+), 153 deletions(-)
>
> diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
> index cc7535c5e192..1b2575430032 100644
> --- a/drivers/infiniband/core/core_priv.h
> +++ b/drivers/infiniband/core/core_priv.h
> @@ -267,7 +267,6 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
>   #endif
>   
>   struct ib_device *ib_device_get_by_index(u32 ifindex);
> -void ib_device_put(struct ib_device *device);
>   /* RDMA device netlink */
>   void nldev_init(void);
>   void nldev_exit(void);
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index 348a7fb1f945..a2523605e02a 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -85,6 +85,9 @@ struct ib_client_data {
>   static DEFINE_MUTEX(device_mutex);
>   static DECLARE_RWSEM(lists_rwsem);
>   
> +/* Used to serialize ib_unregister_driver */
> +static DECLARE_RWSEM(unregister_rwsem);
> +
>   static int ib_security_change(struct notifier_block *nb, unsigned long event,
>   			      void *lsm_data);
>   static void ib_policy_change_task(struct work_struct *work);
> @@ -168,6 +171,7 @@ void ib_device_put(struct ib_device *device)
>   	if (refcount_dec_and_test(&device->refcount))
>   		complete(&device->unreg_completion);
>   }
> +EXPORT_SYMBOL(ib_device_put);
>   
>   static struct ib_device *__ib_device_get_by_name(const char *name)
>   {
> @@ -180,6 +184,27 @@ static struct ib_device *__ib_device_get_by_name(const char *name)
>   	return NULL;
>   }
>   
> +struct ib_device *ib_device_get_by_name(const char *name,
> +					enum rdma_driver_id driver_id)
> +{
> +	struct ib_device *device;
> +
> +	down_read(&lists_rwsem);
> +	device = __ib_device_get_by_name(name);
> +	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
> +	    device->driver_id != driver_id)
> +		device = NULL;
> +
> +	if (device) {
> +		/* Do not return a device if unregistration has started. */
> +		if (!refcount_inc_not_zero(&device->refcount))
> +			device = NULL;
> +	}
> +	up_read(&lists_rwsem);
> +	return device;
> +}
> +EXPORT_SYMBOL(ib_device_get_by_name);
> +
>   int ib_device_rename(struct ib_device *ibdev, const char *name)
>   {
>   	struct ib_device *device;
> @@ -641,28 +666,41 @@ int ib_register_device(struct ib_device *device, const char *name,
>   }
>   EXPORT_SYMBOL(ib_register_device);
>   
> -/**
> - * ib_unregister_device - Unregister an IB device
> - * @device:Device to unregister
> +/* Returns false if the device is already undergoing unregistration in another
> + * thread, and the device pointer may be invalid upon return.
>    *
> - * Unregister an IB device.  All clients will receive a remove callback.
> + * If true is returned then the caller owns a kref associated with the device
> + * (taken from the kref owned by the list).
> + *
> + * Upon return the device cannot be returned by any 'get' functions.
>    */
> -void ib_unregister_device(struct ib_device *device)
> +static bool ib_pre_unregister(struct ib_device *device)
> +{
> +	bool already_removed;
> +
> +	mutex_lock(&device_mutex);
> +	down_write(&lists_rwsem);
> +	already_removed = list_empty(&device->core_list);
Is it possible that there is no elements in device->core_list ?
> +	list_del_init(&device->core_list);
> +	up_write(&lists_rwsem);
> +	mutex_unlock(&device_mutex);
> +
> +	/* Matches the recound_set in ib_alloc_device */
> +	ib_device_put(device);
> +
> +	return !already_removed;
> +}
> +
> +static void __ib_unregister_device(struct ib_device *device)
>   {
>   	struct ib_client_data *context, *tmp;
>   	unsigned long flags;
>   
> -	/*
> -	 * Wait for all netlink command callers to finish working on the
> -	 * device.
> -	 */
> -	ib_device_put(device);
>   	wait_for_completion(&device->unreg_completion);
>   
>   	mutex_lock(&device_mutex);
>   
>   	down_write(&lists_rwsem);
> -	list_del(&device->core_list);
>   	write_lock_irq(&device->client_data_lock);
>   	list_for_each_entry(context, &device->client_data_list, list)
>   		context->going_down = true;
> @@ -696,10 +734,116 @@ void ib_unregister_device(struct ib_device *device)
>   	up_write(&lists_rwsem);
>   
>   	device->reg_state = IB_DEV_UNREGISTERED;
> +
> +	/*
> +	 * Drivers using the new flow may not call ib_dealloc_device except
> +	 * in error unwind prior to registration success.
> +	 */
> +	if (device->driver_unregister) {
> +		device->driver_unregister(device);
> +		ib_dealloc_device(device);
> +	}
> +}
> +
> +/**
> + * ib_unregister_device - Unregister an IB device
> + * @device: The device to unregister
> + *
> + * Unregister an IB device.  All clients will receive a remove callback.
> + *
> + * Callers can call this routine only once, and must protect against
> + * races. Typically it should only be called as part of a remove callback in
> + * an implementation of driver core's struct device_driver and related.
> + *
> + * A driver must choose to use either only ib_unregister_device or
> + * ib_unregister_device_and_put & ib_unregister_driver.
> + */
> +void ib_unregister_device(struct ib_device *device)
> +{
> +	down_read(&unregister_rwsem);
> +	if (ib_pre_unregister(device))
> +		__ib_unregister_device(device);
> +	else
> +		WARN(true, "Driver mixing %s with _and_put", __func__);
> +	up_read(&unregister_rwsem);
>   }
>   EXPORT_SYMBOL(ib_unregister_device);
>   
>   /**
> + * ib_unregister_device_and_put - Unregister a device while holding a 'get'
> + * device: The device to unregister
> + *
> + * This is the same as ib_unregister_device(), except it includes an internal
> + * ib_device_put() that should match a 'get' obtained by the caller.
> + *
> + * It is safe to call this routine concurrently from multiple threads while
> + * holding the 'get', however in this case return from the function does not
> + * guarantee the device is destroyed, only that destruction is in-progress.
> + *
> + * Drivers using this flow MUST use the driver_unregister callback to clean up
> + * their resources associated with the device and dealloc it.
> + */
> +void ib_unregister_device_and_put(struct ib_device *device)
> +{
> +	down_read(&unregister_rwsem);
> +	if (ib_pre_unregister(device)) {
> +		/*
> +		 * Caller's get can be returned now that we hold the kref,
> +		 * otherwise ib_pre_unregister returned the caller's get
> +		 */
> +		ib_device_put(device);
> +		__ib_unregister_device(device);
> +	}
> +	up_read(&unregister_rwsem);
> +}
> +EXPORT_SYMBOL(ib_unregister_device_and_put);
> +
> +/**
> + * ib_unregister_driver - Unregister all IB devices for a driver
> + * @driver_id: The driver to unregister
> + *
> + * This implements a fence for device unregistration. It only returns once all
> + * devices associated with the driver_id have fully completed their
> + * unregistration and returned from ib_unregister_device*().
> + *
> + * If device's are not yet unregistered it goes ahead and starts unregistering
> + * them.
> + *
> + * The caller must ensure that no devices can be be added while this is
> + * running (or at all for the module_unload case)
> + */
> +void ib_unregister_driver(enum rdma_driver_id driver_id)
> +{
> +	struct ib_device *device, *tmp;
> +	LIST_HEAD(driver_devs);
> +
> +	down_write(&unregister_rwsem);
> +	mutex_lock(&device_mutex);
> +	down_write(&lists_rwsem);
> +	list_for_each_entry_safe(device, tmp, &device_list, core_list) {
> +		if (device->driver_id == driver_id)
> +			list_move(&device->core_list, &driver_devs);
> +	}
> +	up_write(&lists_rwsem);
> +
> +	while ((device = list_first_entry_or_null(
> +			&driver_devs, struct ib_device, core_list))) {
> +		mutex_unlock(&device_mutex);
> +		/*
> +		 * We are holding the unregister_rwsem, so it is impossible
> +		 * for another thread to be doing registration.
> +		 */
> +		if (!ib_pre_unregister(device))
> +			WARN(true, "Impossible failure of ib_pre_unregister");
> +		__ib_unregister_device(device);
> +		mutex_lock(&device_mutex);
> +	}
> +	mutex_unlock(&device_mutex);
> +	up_write(&unregister_rwsem);
> +}
> +EXPORT_SYMBOL(ib_unregister_driver);
> +
> +/**
>    * ib_register_client - Register an IB client
>    * @client:Client to register
>    *
> @@ -981,6 +1125,53 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
>   		}
>   }
>   
> +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
> +					  enum rdma_driver_id driver_id)
> +{
> +	struct ib_device *ib_dev;
> +
> +	down_read(&lists_rwsem);
> +	list_for_each_entry(ib_dev, &device_list, core_list) {
> +		unsigned int port;
> +
> +		if (driver_id != RDMA_DRIVER_UNKNOWN &&
> +		    ib_dev->driver_id != driver_id)
> +			continue;
> +
> +		if (!ib_dev->get_netdev)
> +			continue;
> +
> +		for (port = rdma_start_port(ib_dev);
> +		     port <= rdma_end_port(ib_dev); port++) {
> +			struct net_device *idev;
> +
> +			idev = ib_dev->get_netdev(ib_dev, port);
> +			if (idev != ndev) {
> +				if (idev)
> +					dev_put(idev);
> +				continue;
> +			}
> +
> +			/*
> +			 * Do not return a device if unregistration has
> +			 * started.
> +			 */
> +			if (!refcount_inc_not_zero(&ib_dev->refcount)) {
> +				dev_put(idev);
> +				continue;
> +			}
> +
> +			up_read(&lists_rwsem);
> +			dev_put(idev);
> +			return ib_dev;
> +		}
> +	}
> +	up_read(&lists_rwsem);
> +
> +	return NULL;
> +}
> +EXPORT_SYMBOL(ib_device_get_by_netdev);
> +
>   /**
>    * ib_enum_all_roce_netdevs - enumerate all RoCE devices
>    * @filter: Should we call the callback?
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 383e65c7bbc0..4069463f9a66 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -50,8 +50,10 @@ static void rxe_cleanup_ports(struct rxe_dev *rxe)
>   /* free resources for a rxe device all objects created for this device must
>    * have been destroyed
>    */
> -static void rxe_cleanup(struct rxe_dev *rxe)
> +void rxe_unregistered(struct ib_device *ib_dev)
>   {
> +	struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
> +
>   	rxe_pool_cleanup(&rxe->uc_pool);
>   	rxe_pool_cleanup(&rxe->pd_pool);
>   	rxe_pool_cleanup(&rxe->ah_pool);
> @@ -68,15 +70,6 @@ static void rxe_cleanup(struct rxe_dev *rxe)
>   	crypto_free_shash(rxe->tfm);
>   }
>   
> -/* called when all references have been dropped */
> -void rxe_release(struct kref *kref)
> -{
> -	struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt);
> -
> -	rxe_cleanup(rxe);
> -	ib_dealloc_device(&rxe->ib_dev);
> -}
> -
>   /* initialize rxe device parameters */
>   static void rxe_init_device_param(struct rxe_dev *rxe)
>   {
> @@ -279,7 +272,6 @@ static int rxe_init(struct rxe_dev *rxe)
>   	spin_lock_init(&rxe->mmap_offset_lock);
>   	spin_lock_init(&rxe->pending_lock);
>   	INIT_LIST_HEAD(&rxe->pending_mmaps);
> -	INIT_LIST_HEAD(&rxe->list);
>   
>   	mutex_init(&rxe->usdev_lock);
>   
> @@ -312,31 +304,13 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
>   {
>   	int err;
>   
> -	kref_init(&rxe->ref_cnt);
> -
>   	err = rxe_init(rxe);
>   	if (err)
> -		goto err1;
> +		return err;
>   
>   	rxe_set_mtu(rxe, mtu);
>   
> -	err = rxe_register_device(rxe);
> -	if (err)
> -		goto err1;
> -
> -	return 0;
> -
> -err1:
> -	rxe_dev_put(rxe);
> -	return err;
> -}
> -
> -/* called by the ifc layer to remove a device */
> -void rxe_remove(struct rxe_dev *rxe)
> -{
> -	rxe_unregister_device(rxe);
> -
> -	rxe_dev_put(rxe);
> +	return rxe_register_device(rxe);
>   }
>   
>   static int __init rxe_module_init(void)
> @@ -360,7 +334,7 @@ static int __init rxe_module_init(void)
>   
>   static void __exit rxe_module_exit(void)
>   {
> -	rxe_remove_all();
> +	ib_unregister_driver(RDMA_DRIVER_RXE);
>   	rxe_net_exit();
>   	rxe_cache_exit();
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
> index 8f79bd86d033..75e7113a756d 100644
> --- a/drivers/infiniband/sw/rxe/rxe.h
> +++ b/drivers/infiniband/sw/rxe/rxe.h
> @@ -96,17 +96,19 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe,
>   void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
>   
>   int rxe_add(struct rxe_dev *rxe, unsigned int mtu);
> -void rxe_remove(struct rxe_dev *rxe);
> -void rxe_remove_all(void);
>   
>   void rxe_rcv(struct sk_buff *skb);
>   
> -static inline void rxe_dev_put(struct rxe_dev *rxe)
> +/* The caller must do a matching ib_device_put(&dev->ib_dev) */
> +static inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev)
>   {
> -	kref_put(&rxe->ref_cnt, rxe_release);
> +	struct ib_device *ibdev =
> +		ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE);
> +
> +	if (!ibdev)
> +		return NULL;
> +	return container_of(ibdev, struct rxe_dev, ib_dev);
>   }
> -struct rxe_dev *net_to_rxe(struct net_device *ndev);
> -struct rxe_dev *get_rxe_by_name(const char *name);
>   
>   void rxe_port_up(struct rxe_dev *rxe);
>   void rxe_port_down(struct rxe_dev *rxe);
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index a675c9f2b427..e830d56a15c2 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -231,7 +231,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
>   		      struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
>   		      struct rxe_modify_srq_cmd *ucmd);
>   
> -void rxe_release(struct kref *kref);
> +void rxe_unregistered(struct ib_device *ib_dev);
>   
>   int rxe_completer(void *arg);
>   int rxe_requester(void *arg);
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
> index b26a8141f3ed..c7637ad5e584 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.c
> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> @@ -45,43 +45,6 @@
>   #include "rxe_net.h"
>   #include "rxe_loc.h"
>   
> -static LIST_HEAD(rxe_dev_list);
> -static DEFINE_SPINLOCK(dev_list_lock); /* spinlock for device list */
> -
> -struct rxe_dev *net_to_rxe(struct net_device *ndev)
> -{
> -	struct rxe_dev *rxe;
> -	struct rxe_dev *found = NULL;
> -
> -	spin_lock_bh(&dev_list_lock);
> -	list_for_each_entry(rxe, &rxe_dev_list, list) {
> -		if (rxe->ndev == ndev) {
> -			found = rxe;
> -			break;
> -		}
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -
> -	return found;
> -}
> -
> -struct rxe_dev *get_rxe_by_name(const char *name)
> -{
> -	struct rxe_dev *rxe;
> -	struct rxe_dev *found = NULL;
> -
> -	spin_lock_bh(&dev_list_lock);
> -	list_for_each_entry(rxe, &rxe_dev_list, list) {
> -		if (!strcmp(name, dev_name(&rxe->ib_dev.dev))) {
> -			found = rxe;
> -			break;
> -		}
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -	return found;
> -}
> -
> -
>   static struct rxe_recv_sockets recv_sockets;
>   
>   struct device *rxe_dma_device(struct rxe_dev *rxe)
> @@ -229,18 +192,19 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>   	struct udphdr *udph;
>   	struct net_device *ndev = skb->dev;
>   	struct net_device *rdev = ndev;
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
> +	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
>   	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
>   
>   	if (!rxe && is_vlan_dev(rdev)) {
>   		rdev = vlan_dev_real_dev(ndev);
> -		rxe = net_to_rxe(rdev);
> +		rxe = rxe_get_dev_from_net(rdev);
>   	}
>   	if (!rxe)
>   		goto drop;
>   
>   	if (skb_linearize(skb)) {
>   		pr_err("skb_linearize failed\n");
> +		ib_device_put(&rxe->ib_dev);
>   		goto drop;
>   	}
>   
> @@ -253,6 +217,12 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>   
>   	rxe_rcv(skb);
>   
> +	/*
> +	 * FIXME: this is in the wrong place, it needs to be done when pkt is
> +	 * destroyed
> +	 */
> +	ib_device_put(&rxe->ib_dev);
> +
>   	return 0;
>   drop:
>   	kfree_skb(skb);
> @@ -558,36 +528,18 @@ struct rxe_dev *rxe_net_add(struct net_device *ndev)
>   	rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
>   	if (!rxe)
>   		return NULL;
> -
> +	rxe->ib_dev.driver_unregister = rxe_unregistered;
>   	rxe->ndev = ndev;
>   
>   	err = rxe_add(rxe, ndev->mtu);
>   	if (err) {
> -		ib_dealloc_device(&rxe->ib_dev);
> +		rxe_unregistered(&rxe->ib_dev);
>   		return NULL;
>   	}
>   
> -	spin_lock_bh(&dev_list_lock);
> -	list_add_tail(&rxe->list, &rxe_dev_list);
> -	spin_unlock_bh(&dev_list_lock);
>   	return rxe;
>   }
>   
> -void rxe_remove_all(void)
> -{
> -	spin_lock_bh(&dev_list_lock);
> -	while (!list_empty(&rxe_dev_list)) {
> -		struct rxe_dev *rxe =
> -			list_first_entry(&rxe_dev_list, struct rxe_dev, list);
> -
> -		list_del(&rxe->list);
> -		spin_unlock_bh(&dev_list_lock);
> -		rxe_remove(rxe);
> -		spin_lock_bh(&dev_list_lock);
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -}
> -
>   static void rxe_port_event(struct rxe_dev *rxe,
>   			   enum ib_event_type event)
>   {
> @@ -630,16 +582,16 @@ static int rxe_notify(struct notifier_block *not_blk,
>   		      void *arg)
>   {
>   	struct net_device *ndev = netdev_notifier_info_to_dev(arg);
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
> +	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
>   
>   	if (!rxe)
> -		goto out;
> +		return NOTIFY_OK;
>   
>   	switch (event) {
>   	case NETDEV_UNREGISTER:
> -		list_del(&rxe->list);
> -		rxe_remove(rxe);
> -		break;
> +		ib_unregister_device_and_put(&rxe->ib_dev);
> +		return NOTIFY_OK;
> +
>   	case NETDEV_UP:
>   		rxe_port_up(rxe);
>   		break;
> @@ -666,7 +618,8 @@ static int rxe_notify(struct notifier_block *not_blk,
>   			event, ndev->name);
>   		break;
>   	}
> -out:
> +
> +	ib_device_put(&rxe->ib_dev);
>   	return NOTIFY_OK;
>   }
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> index a04a076e03af..bb7e3268713e 100644
> --- a/drivers/infiniband/sw/rxe/rxe_pool.c
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> @@ -390,8 +390,6 @@ void *rxe_alloc(struct rxe_pool *pool)
>   	kref_get(&pool->ref_cnt);
>   	read_unlock_irqrestore(&pool->pool_lock, flags);
>   
> -	kref_get(&pool->rxe->ref_cnt);
> -
>   	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
>   		goto out_put_pool;
>   
> @@ -408,7 +406,6 @@ void *rxe_alloc(struct rxe_pool *pool)
>   
>   out_put_pool:
>   	atomic_dec(&pool->num_elem);
> -	rxe_dev_put(pool->rxe);
>   	rxe_pool_put(pool);
>   	return NULL;
>   }
> @@ -424,7 +421,6 @@ void rxe_elem_release(struct kref *kref)
>   
>   	kmem_cache_free(pool_cache(pool), elem);
>   	atomic_dec(&pool->num_elem);
> -	rxe_dev_put(pool->rxe);
>   	rxe_pool_put(pool);
>   }
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c
> index 73a19f808e1b..87c17675cf9d 100644
> --- a/drivers/infiniband/sw/rxe/rxe_sysfs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
> @@ -53,20 +53,14 @@ static int sanitize_arg(const char *val, char *intf, int intf_len)
>   	return len;
>   }
>   
> -static void rxe_set_port_state(struct net_device *ndev)
> +static void rxe_set_port_state(struct rxe_dev *rxe, struct net_device *ndev)
>   {
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
>   	bool is_up = netif_running(ndev) && netif_carrier_ok(ndev);
>   
> -	if (!rxe)
> -		goto out;
> -
>   	if (is_up)
>   		rxe_port_up(rxe);
>   	else
>   		rxe_port_down(rxe); /* down for unknown state */
> -out:
> -	return;
>   }
>   
>   static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
> @@ -74,24 +68,25 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
>   	int len;
>   	int err = 0;
>   	char intf[32];
> -	struct net_device *ndev = NULL;
> +	struct net_device *ndev;
> +	struct rxe_dev *exists;
>   	struct rxe_dev *rxe;
>   
>   	len = sanitize_arg(val, intf, sizeof(intf));
>   	if (!len) {
>   		pr_err("add: invalid interface name\n");
> -		err = -EINVAL;
> -		goto err;
> +		return -EINVAL;
>   	}
>   
>   	ndev = dev_get_by_name(&init_net, intf);
>   	if (!ndev) {
>   		pr_err("interface %s not found\n", intf);
> -		err = -EINVAL;
> -		goto err;
> +		return -EINVAL;
>   	}
>   
> -	if (net_to_rxe(ndev)) {
> +	exists = rxe_get_dev_from_net(ndev);
> +	if (exists) {
> +		ib_device_put(&exists->ib_dev);
>   		pr_err("already configured on %s\n", intf);
>   		err = -EINVAL;
>   		goto err;
> @@ -104,11 +99,11 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
>   		goto err;
>   	}
>   
> -	rxe_set_port_state(ndev);
> +	rxe_set_port_state(rxe, ndev);
>   	dev_info(&rxe->ib_dev.dev, "added %s\n", intf);
> +
>   err:
> -	if (ndev)
> -		dev_put(ndev);
> +	dev_put(ndev);
>   	return err;
>   }
>   
> @@ -116,7 +111,7 @@ static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
>   {
>   	int len;
>   	char intf[32];
> -	struct rxe_dev *rxe;
> +	struct ib_device *ib_dev;
>   
>   	len = sanitize_arg(val, intf, sizeof(intf));
>   	if (!len) {
> @@ -126,20 +121,18 @@ static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
>   
>   	if (strncmp("all", intf, len) == 0) {
>   		pr_info("rxe_sys: remove all");
> -		rxe_remove_all();
> +		ib_unregister_driver(RDMA_DRIVER_RXE);
>   		return 0;
>   	}
>   
> -	rxe = get_rxe_by_name(intf);
> +	ib_dev = ib_device_get_by_name(intf, RDMA_DRIVER_RXE);
>   
> -	if (!rxe) {
> +	if (!ib_dev) {
>   		pr_err("not configured on %s\n", intf);
>   		return -EINVAL;
>   	}
>   
> -	list_del(&rxe->list);
> -	rxe_remove(rxe);
> -
> +	ib_unregister_device_and_put(ib_dev);
>   	return 0;
>   }
>   
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index 30817c79ba96..28beee8f5838 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -1286,10 +1286,3 @@ int rxe_register_device(struct rxe_dev *rxe)
>   
>   	return err;
>   }
> -
> -void rxe_unregister_device(struct rxe_dev *rxe)
> -{
> -	struct ib_device *dev = &rxe->ib_dev;
> -
> -	ib_unregister_device(dev);
> -}
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index 831381b7788d..a0b635812784 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -385,7 +385,6 @@ struct rxe_dev {
>   	struct ib_device_attr	attr;
>   	int			max_ucontext;
>   	int			max_inline_data;
> -	struct kref		ref_cnt;
>   	struct mutex	usdev_lock;
>   
>   	struct net_device	*ndev;
> @@ -412,7 +411,6 @@ struct rxe_dev {
>   	u64			stats_counters[RXE_NUM_OF_COUNTERS];
>   
>   	struct rxe_port		port;
> -	struct list_head	list;
>   	struct crypto_shash	*tfm;
>   };
>   
> @@ -467,7 +465,6 @@ static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
>   }
>   
>   int rxe_register_device(struct rxe_dev *rxe);
> -void rxe_unregister_device(struct rxe_dev *rxe);
>   
>   void rxe_mc_cleanup(struct rxe_pool_entry *arg);
>   
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 85021451eee0..5fbeedacb3aa 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -2534,6 +2534,9 @@ struct ib_device {
>   				 struct ib_counters_read_attr *counters_read_attr,
>   				 struct uverbs_attr_bundle *attrs);
>   
> +	/* Called after all clients have been destroyed*/
> +	void (*driver_unregister)(struct ib_device *dev);
> +
>   	/**
>   	 * rdma netdev operation
>   	 *
> @@ -2653,6 +2656,8 @@ int ib_register_device(struct ib_device *device, const char *name,
>   		       int (*port_callback)(struct ib_device *, u8,
>   					    struct kobject *));
>   void ib_unregister_device(struct ib_device *device);
> +void ib_unregister_driver(enum rdma_driver_id driver_id);
> +void ib_unregister_device_and_put(struct ib_device *device);
>   
>   int ib_register_client   (struct ib_client *client);
>   void ib_unregister_client(struct ib_client *client);
> @@ -3937,6 +3942,11 @@ static inline bool ib_access_writable(int access_flags)
>   int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
>   		       struct ib_mr_status *mr_status);
>   
> +void ib_device_put(struct ib_device *device);
> +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
> +					  enum rdma_driver_id driver_id);
> +struct ib_device *ib_device_get_by_name(const char *name,
> +					enum rdma_driver_id driver_id);
>   struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
>   					    u16 pkey, const union ib_gid *gid,
>   					    const struct sockaddr *addr);
Steve Wise Dec. 17, 2018, 4:28 p.m. UTC | #4
On 12/14/2018 11:16 PM, Zhu Yanjun wrote:
>
> On 2018/12/15 上午12:05, Jason Gunthorpe wrote:
>> Replace
>>   - The nonsense rxe_dev.kref with rxe_dev.ib_dev.dev.kref
>>   - rxe_dev_list with core's device_list.
>>   - Buggy net_to_rxe with ib_device_get_by_netdev that does proper
>>     lifetime management
>>   - Buggy get_rxe_by_name with ib_device_get_by_name
>>
>> Add
>>   - ib_unregister_driver so drivers like rxe don't have to keep track
>>     of things to do module unload
>>   - ib_unregister_device_and_put to work with potiners returned by
>>     that various ib_device_get_* functions
>>
>> The refcount change to the pool might need more work, but fundamentally
>> the driver cannot have krefs open in the pool across unregistration
>> or it
>> contains module unlolad races.
>>
>> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
>> ---
>>   drivers/infiniband/core/core_priv.h   |   1 -
>>   drivers/infiniband/core/device.c      | 213
>> ++++++++++++++++++++++++++++++++--
>>   drivers/infiniband/sw/rxe/rxe.c       |  38 +-----
>>   drivers/infiniband/sw/rxe/rxe.h       |  14 ++-
>>   drivers/infiniband/sw/rxe/rxe_loc.h   |   2 +-
>>   drivers/infiniband/sw/rxe/rxe_net.c   |  83 +++----------
>>   drivers/infiniband/sw/rxe/rxe_pool.c  |   4 -
>>   drivers/infiniband/sw/rxe/rxe_sysfs.c |  39 +++----
>>   drivers/infiniband/sw/rxe/rxe_verbs.c |   7 --
>>   drivers/infiniband/sw/rxe/rxe_verbs.h |   3 -
>>   include/rdma/ib_verbs.h               |  10 ++
>>   11 files changed, 261 insertions(+), 153 deletions(-)
>>
>> diff --git a/drivers/infiniband/core/core_priv.h
>> b/drivers/infiniband/core/core_priv.h
>> index cc7535c5e192..1b2575430032 100644
>> --- a/drivers/infiniband/core/core_priv.h
>> +++ b/drivers/infiniband/core/core_priv.h
>> @@ -267,7 +267,6 @@ static inline int ib_mad_enforce_security(struct
>> ib_mad_agent_private *map,
>>   #endif
>>     struct ib_device *ib_device_get_by_index(u32 ifindex);
>> -void ib_device_put(struct ib_device *device);
>>   /* RDMA device netlink */
>>   void nldev_init(void);
>>   void nldev_exit(void);
>> diff --git a/drivers/infiniband/core/device.c
>> b/drivers/infiniband/core/device.c
>> index 348a7fb1f945..a2523605e02a 100644
>> --- a/drivers/infiniband/core/device.c
>> +++ b/drivers/infiniband/core/device.c
>> @@ -85,6 +85,9 @@ struct ib_client_data {
>>   static DEFINE_MUTEX(device_mutex);
>>   static DECLARE_RWSEM(lists_rwsem);
>>   +/* Used to serialize ib_unregister_driver */
>> +static DECLARE_RWSEM(unregister_rwsem);
>> +
>>   static int ib_security_change(struct notifier_block *nb, unsigned
>> long event,
>>                     void *lsm_data);
>>   static void ib_policy_change_task(struct work_struct *work);
>> @@ -168,6 +171,7 @@ void ib_device_put(struct ib_device *device)
>
> Where is this function ib_device_put? This function is used many times.
>
> Zhu Yanjun
>
It is in drivers/infiniband/core/device.c.  It was core-private.  This
patch exports it.

Steve.
Zhu Yanjun Dec. 18, 2018, 1:07 a.m. UTC | #5
On 2018/12/18 0:28, Steve Wise wrote:
> On 12/14/2018 11:16 PM, Zhu Yanjun wrote:
>> On 2018/12/15 上午12:05, Jason Gunthorpe wrote:
>>> Replace
>>>    - The nonsense rxe_dev.kref with rxe_dev.ib_dev.dev.kref
>>>    - rxe_dev_list with core's device_list.
>>>    - Buggy net_to_rxe with ib_device_get_by_netdev that does proper
>>>      lifetime management
>>>    - Buggy get_rxe_by_name with ib_device_get_by_name
>>>
>>> Add
>>>    - ib_unregister_driver so drivers like rxe don't have to keep track
>>>      of things to do module unload
>>>    - ib_unregister_device_and_put to work with potiners returned by
>>>      that various ib_device_get_* functions
>>>
>>> The refcount change to the pool might need more work, but fundamentally
>>> the driver cannot have krefs open in the pool across unregistration
>>> or it
>>> contains module unlolad races.
>>>
>>> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
>>> ---
>>>    drivers/infiniband/core/core_priv.h   |   1 -
>>>    drivers/infiniband/core/device.c      | 213
>>> ++++++++++++++++++++++++++++++++--
>>>    drivers/infiniband/sw/rxe/rxe.c       |  38 +-----
>>>    drivers/infiniband/sw/rxe/rxe.h       |  14 ++-
>>>    drivers/infiniband/sw/rxe/rxe_loc.h   |   2 +-
>>>    drivers/infiniband/sw/rxe/rxe_net.c   |  83 +++----------
>>>    drivers/infiniband/sw/rxe/rxe_pool.c  |   4 -
>>>    drivers/infiniband/sw/rxe/rxe_sysfs.c |  39 +++----
>>>    drivers/infiniband/sw/rxe/rxe_verbs.c |   7 --
>>>    drivers/infiniband/sw/rxe/rxe_verbs.h |   3 -
>>>    include/rdma/ib_verbs.h               |  10 ++
>>>    11 files changed, 261 insertions(+), 153 deletions(-)
>>>
>>> diff --git a/drivers/infiniband/core/core_priv.h
>>> b/drivers/infiniband/core/core_priv.h
>>> index cc7535c5e192..1b2575430032 100644
>>> --- a/drivers/infiniband/core/core_priv.h
>>> +++ b/drivers/infiniband/core/core_priv.h
>>> @@ -267,7 +267,6 @@ static inline int ib_mad_enforce_security(struct
>>> ib_mad_agent_private *map,
>>>    #endif
>>>      struct ib_device *ib_device_get_by_index(u32 ifindex);
>>> -void ib_device_put(struct ib_device *device);
>>>    /* RDMA device netlink */
>>>    void nldev_init(void);
>>>    void nldev_exit(void);
>>> diff --git a/drivers/infiniband/core/device.c
>>> b/drivers/infiniband/core/device.c
>>> index 348a7fb1f945..a2523605e02a 100644
>>> --- a/drivers/infiniband/core/device.c
>>> +++ b/drivers/infiniband/core/device.c
>>> @@ -85,6 +85,9 @@ struct ib_client_data {
>>>    static DEFINE_MUTEX(device_mutex);
>>>    static DECLARE_RWSEM(lists_rwsem);
>>>    +/* Used to serialize ib_unregister_driver */
>>> +static DECLARE_RWSEM(unregister_rwsem);
>>> +
>>>    static int ib_security_change(struct notifier_block *nb, unsigned
>>> long event,
>>>                      void *lsm_data);
>>>    static void ib_policy_change_task(struct work_struct *work);
>>> @@ -168,6 +171,7 @@ void ib_device_put(struct ib_device *device)
>> Where is this function ib_device_put? This function is used many times.
>>
>> Zhu Yanjun
>>
> It is in drivers/infiniband/core/device.c.  It was core-private.  This
> patch exports it.

Which repository are you using?

Zhu Yanjun
>
> Steve.
>
>
Jason Gunthorpe Dec. 18, 2018, 2:51 a.m. UTC | #6
On Fri, Dec 14, 2018 at 07:28:19PM -0700, Parav Pandit wrote:

> > -/**
> > - * ib_unregister_device - Unregister an IB device
> > - * @device:Device to unregister
> > +/* Returns false if the device is already undergoing unregistration in
> > +another
> > + * thread, and the device pointer may be invalid upon return.
> >   *
> > - * Unregister an IB device.  All clients will receive a remove callback.
> > + * If true is returned then the caller owns a kref associated with the
> > + device
> > + * (taken from the kref owned by the list).
> > + *
> > + * Upon return the device cannot be returned by any 'get' functions.
> >   */
> > -void ib_unregister_device(struct ib_device *device)
> > +static bool ib_pre_unregister(struct ib_device *device) {
> > +	bool already_removed;
> > +
> > +	mutex_lock(&device_mutex);
> > +	down_write(&lists_rwsem);
> > +	already_removed = list_empty(&device->core_list);
> > +	list_del_init(&device->core_list);
> > +	up_write(&lists_rwsem);
> > +	mutex_unlock(&device_mutex);
> 
> No. you cannot unlock device_mutex here.  Now that device is removed
> from the list, another ib_register_device tries to create device
> with same name and fails to creates sysfs entries.

Hmm. Perhaps this could be a flag then, or perhaps we can have name
assignment check the actual device list..

The same issue is with the list_move below, so this needs a more
careful think.

> > +	/* Matches the recound_set in ib_alloc_device */
> Did you mean refcount set?

yes
 
> > +	while ((device = list_first_entry_or_null(
> > +			&driver_devs, struct ib_device, core_list))) {
> > +		mutex_unlock(&device_mutex);
> > +		/*
> > +		 * We are holding the unregister_rwsem, so it is impossible
> > +		 * for another thread to be doing registration.
> registration or unregistration?

unregistration

> > @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> >  	struct udphdr *udph;
> >  	struct net_device *ndev = skb->dev;
> >  	struct net_device *rdev = ndev;
> > -	struct rxe_dev *rxe = net_to_rxe(ndev);
> > +	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
> >  	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
> > 
> >  	if (!rxe && is_vlan_dev(rdev)) {
> >  		rdev = vlan_dev_real_dev(ndev);
> > -		rxe = net_to_rxe(rdev);
> > +		rxe = rxe_get_dev_from_net(rdev);
> 
> Please, let's not lock-unlock a read semaphore on every incoming RoCE udp packet that too in NAPI's non-blocking context.

This is an atomic context? 

That will need a bunch more code to solve properly :\

> > @@ -558,36 +528,18 @@ struct rxe_dev *rxe_net_add(struct net_device
> > *ndev)
> >  	rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
> >  	if (!rxe)
> >  		return NULL;
> > -
> > +	rxe->ib_dev.driver_unregister = rxe_unregistered;
> >  	rxe->ndev = ndev;
> > 
> >  	err = rxe_add(rxe, ndev->mtu);
> >  	if (err) {
> > -		ib_dealloc_device(&rxe->ib_dev);
> Unbalanced ib_alloc_device and ib_dealloc_device doesn't sound like a good approach.
> There should be a better way to do this.

Er.. that kind of looks wrong actually, this looks like it needs the
dealloc still.

Once the register succeeds the dealloc responsibility falls to the
core code.

> I think driver unload and device unregistration related complexity
> should be left in the rxe driver.  

no, too many drivers have this pattern and all are getting it
wrong. The core code needs to handle more of it for the driver.

Jason
Steve Wise Dec. 18, 2018, 3:34 a.m. UTC | #7
> -----Original Message-----
> From: Yanjun Zhu <yanjun.zhu@oracle.com>
> Sent: Monday, December 17, 2018 7:07 PM
> To: Steve Wise <swise@opengridcomputing.com>; Jason Gunthorpe
> <jgg@mellanox.com>; dledford@redhat.com
> Cc: linux-rdma@vger.kernel.org; BMT@zurich.ibm.com; leon@kernel.org;
> markb@mellanox.com
> Subject: Re: [PATCH v7 1/5] verbs/rxe: Use core services to keep track of RXE
> things
> 
> 
> 
> On 2018/12/18 0:28, Steve Wise wrote:
> > On 12/14/2018 11:16 PM, Zhu Yanjun wrote:
> >> On 2018/12/15 上午12:05, Jason Gunthorpe wrote:
> >>> Replace
> >>>    - The nonsense rxe_dev.kref with rxe_dev.ib_dev.dev.kref
> >>>    - rxe_dev_list with core's device_list.
> >>>    - Buggy net_to_rxe with ib_device_get_by_netdev that does proper
> >>>      lifetime management
> >>>    - Buggy get_rxe_by_name with ib_device_get_by_name
> >>>
> >>> Add
> >>>    - ib_unregister_driver so drivers like rxe don't have to keep track
> >>>      of things to do module unload
> >>>    - ib_unregister_device_and_put to work with potiners returned by
> >>>      that various ib_device_get_* functions
> >>>
> >>> The refcount change to the pool might need more work, but
> fundamentally
> >>> the driver cannot have krefs open in the pool across unregistration
> >>> or it
> >>> contains module unlolad races.
> >>>
> >>> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> >>> ---
> >>>    drivers/infiniband/core/core_priv.h   |   1 -
> >>>    drivers/infiniband/core/device.c      | 213
> >>> ++++++++++++++++++++++++++++++++--
> >>>    drivers/infiniband/sw/rxe/rxe.c       |  38 +-----
> >>>    drivers/infiniband/sw/rxe/rxe.h       |  14 ++-
> >>>    drivers/infiniband/sw/rxe/rxe_loc.h   |   2 +-
> >>>    drivers/infiniband/sw/rxe/rxe_net.c   |  83 +++----------
> >>>    drivers/infiniband/sw/rxe/rxe_pool.c  |   4 -
> >>>    drivers/infiniband/sw/rxe/rxe_sysfs.c |  39 +++----
> >>>    drivers/infiniband/sw/rxe/rxe_verbs.c |   7 --
> >>>    drivers/infiniband/sw/rxe/rxe_verbs.h |   3 -
> >>>    include/rdma/ib_verbs.h               |  10 ++
> >>>    11 files changed, 261 insertions(+), 153 deletions(-)
> >>>
> >>> diff --git a/drivers/infiniband/core/core_priv.h
> >>> b/drivers/infiniband/core/core_priv.h
> >>> index cc7535c5e192..1b2575430032 100644
> >>> --- a/drivers/infiniband/core/core_priv.h
> >>> +++ b/drivers/infiniband/core/core_priv.h
> >>> @@ -267,7 +267,6 @@ static inline int ib_mad_enforce_security(struct
> >>> ib_mad_agent_private *map,
> >>>    #endif
> >>>      struct ib_device *ib_device_get_by_index(u32 ifindex);
> >>> -void ib_device_put(struct ib_device *device);
> >>>    /* RDMA device netlink */
> >>>    void nldev_init(void);
> >>>    void nldev_exit(void);
> >>> diff --git a/drivers/infiniband/core/device.c
> >>> b/drivers/infiniband/core/device.c
> >>> index 348a7fb1f945..a2523605e02a 100644
> >>> --- a/drivers/infiniband/core/device.c
> >>> +++ b/drivers/infiniband/core/device.c
> >>> @@ -85,6 +85,9 @@ struct ib_client_data {
> >>>    static DEFINE_MUTEX(device_mutex);
> >>>    static DECLARE_RWSEM(lists_rwsem);
> >>>    +/* Used to serialize ib_unregister_driver */
> >>> +static DECLARE_RWSEM(unregister_rwsem);
> >>> +
> >>>    static int ib_security_change(struct notifier_block *nb, unsigned
> >>> long event,
> >>>                      void *lsm_data);
> >>>    static void ib_policy_change_task(struct work_struct *work);
> >>> @@ -168,6 +171,7 @@ void ib_device_put(struct ib_device *device)
> >> Where is this function ib_device_put? This function is used many times.
> >>
> >> Zhu Yanjun
> >>
> > It is in drivers/infiniband/core/device.c.  It was core-private.  This
> > patch exports it.
> 
> Which repository are you using?
> 

for-next branch of the rdma repo.
Zhu Yanjun Dec. 18, 2018, 3:36 a.m. UTC | #8
On 2018/12/18 11:34, Steve Wise wrote:
>
>> -----Original Message-----
>> From: Yanjun Zhu <yanjun.zhu@oracle.com>
>> Sent: Monday, December 17, 2018 7:07 PM
>> To: Steve Wise <swise@opengridcomputing.com>; Jason Gunthorpe
>> <jgg@mellanox.com>; dledford@redhat.com
>> Cc: linux-rdma@vger.kernel.org; BMT@zurich.ibm.com; leon@kernel.org;
>> markb@mellanox.com
>> Subject: Re: [PATCH v7 1/5] verbs/rxe: Use core services to keep track of RXE
>> things
>>
>>
>>
>> On 2018/12/18 0:28, Steve Wise wrote:
>>> On 12/14/2018 11:16 PM, Zhu Yanjun wrote:
>>>> On 2018/12/15 上午12:05, Jason Gunthorpe wrote:
>>>>> Replace
>>>>>     - The nonsense rxe_dev.kref with rxe_dev.ib_dev.dev.kref
>>>>>     - rxe_dev_list with core's device_list.
>>>>>     - Buggy net_to_rxe with ib_device_get_by_netdev that does proper
>>>>>       lifetime management
>>>>>     - Buggy get_rxe_by_name with ib_device_get_by_name
>>>>>
>>>>> Add
>>>>>     - ib_unregister_driver so drivers like rxe don't have to keep track
>>>>>       of things to do module unload
>>>>>     - ib_unregister_device_and_put to work with potiners returned by
>>>>>       that various ib_device_get_* functions
>>>>>
>>>>> The refcount change to the pool might need more work, but
>> fundamentally
>>>>> the driver cannot have krefs open in the pool across unregistration
>>>>> or it
>>>>> contains module unlolad races.
>>>>>
>>>>> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
>>>>> ---
>>>>>     drivers/infiniband/core/core_priv.h   |   1 -
>>>>>     drivers/infiniband/core/device.c      | 213
>>>>> ++++++++++++++++++++++++++++++++--
>>>>>     drivers/infiniband/sw/rxe/rxe.c       |  38 +-----
>>>>>     drivers/infiniband/sw/rxe/rxe.h       |  14 ++-
>>>>>     drivers/infiniband/sw/rxe/rxe_loc.h   |   2 +-
>>>>>     drivers/infiniband/sw/rxe/rxe_net.c   |  83 +++----------
>>>>>     drivers/infiniband/sw/rxe/rxe_pool.c  |   4 -
>>>>>     drivers/infiniband/sw/rxe/rxe_sysfs.c |  39 +++----
>>>>>     drivers/infiniband/sw/rxe/rxe_verbs.c |   7 --
>>>>>     drivers/infiniband/sw/rxe/rxe_verbs.h |   3 -
>>>>>     include/rdma/ib_verbs.h               |  10 ++
>>>>>     11 files changed, 261 insertions(+), 153 deletions(-)
>>>>>
>>>>> diff --git a/drivers/infiniband/core/core_priv.h
>>>>> b/drivers/infiniband/core/core_priv.h
>>>>> index cc7535c5e192..1b2575430032 100644
>>>>> --- a/drivers/infiniband/core/core_priv.h
>>>>> +++ b/drivers/infiniband/core/core_priv.h
>>>>> @@ -267,7 +267,6 @@ static inline int ib_mad_enforce_security(struct
>>>>> ib_mad_agent_private *map,
>>>>>     #endif
>>>>>       struct ib_device *ib_device_get_by_index(u32 ifindex);
>>>>> -void ib_device_put(struct ib_device *device);
>>>>>     /* RDMA device netlink */
>>>>>     void nldev_init(void);
>>>>>     void nldev_exit(void);
>>>>> diff --git a/drivers/infiniband/core/device.c
>>>>> b/drivers/infiniband/core/device.c
>>>>> index 348a7fb1f945..a2523605e02a 100644
>>>>> --- a/drivers/infiniband/core/device.c
>>>>> +++ b/drivers/infiniband/core/device.c
>>>>> @@ -85,6 +85,9 @@ struct ib_client_data {
>>>>>     static DEFINE_MUTEX(device_mutex);
>>>>>     static DECLARE_RWSEM(lists_rwsem);
>>>>>     +/* Used to serialize ib_unregister_driver */
>>>>> +static DECLARE_RWSEM(unregister_rwsem);
>>>>> +
>>>>>     static int ib_security_change(struct notifier_block *nb, unsigned
>>>>> long event,
>>>>>                       void *lsm_data);
>>>>>     static void ib_policy_change_task(struct work_struct *work);
>>>>> @@ -168,6 +171,7 @@ void ib_device_put(struct ib_device *device)
>>>> Where is this function ib_device_put? This function is used many times.
>>>>
>>>> Zhu Yanjun
>>>>
>>> It is in drivers/infiniband/core/device.c.  It was core-private.  This
>>> patch exports it.
>> Which repository are you using?
>>
> for-next branch of the rdma repo.
Sure. In linux mainline repo, this function ib_device_put does not exist.:-)

Zhu Yanjun
>
>
diff mbox series

Patch

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index cc7535c5e192..1b2575430032 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -267,7 +267,6 @@  static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
 #endif
 
 struct ib_device *ib_device_get_by_index(u32 ifindex);
-void ib_device_put(struct ib_device *device);
 /* RDMA device netlink */
 void nldev_init(void);
 void nldev_exit(void);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 348a7fb1f945..a2523605e02a 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -85,6 +85,9 @@  struct ib_client_data {
 static DEFINE_MUTEX(device_mutex);
 static DECLARE_RWSEM(lists_rwsem);
 
+/* Used to serialize ib_unregister_driver */
+static DECLARE_RWSEM(unregister_rwsem);
+
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
 			      void *lsm_data);
 static void ib_policy_change_task(struct work_struct *work);
@@ -168,6 +171,7 @@  void ib_device_put(struct ib_device *device)
 	if (refcount_dec_and_test(&device->refcount))
 		complete(&device->unreg_completion);
 }
+EXPORT_SYMBOL(ib_device_put);
 
 static struct ib_device *__ib_device_get_by_name(const char *name)
 {
@@ -180,6 +184,27 @@  static struct ib_device *__ib_device_get_by_name(const char *name)
 	return NULL;
 }
 
+struct ib_device *ib_device_get_by_name(const char *name,
+					enum rdma_driver_id driver_id)
+{
+	struct ib_device *device;
+
+	down_read(&lists_rwsem);
+	device = __ib_device_get_by_name(name);
+	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
+	    device->driver_id != driver_id)
+		device = NULL;
+
+	if (device) {
+		/* Do not return a device if unregistration has started. */
+		if (!refcount_inc_not_zero(&device->refcount))
+			device = NULL;
+	}
+	up_read(&lists_rwsem);
+	return device;
+}
+EXPORT_SYMBOL(ib_device_get_by_name);
+
 int ib_device_rename(struct ib_device *ibdev, const char *name)
 {
 	struct ib_device *device;
@@ -641,28 +666,41 @@  int ib_register_device(struct ib_device *device, const char *name,
 }
 EXPORT_SYMBOL(ib_register_device);
 
-/**
- * ib_unregister_device - Unregister an IB device
- * @device:Device to unregister
+/* Returns false if the device is already undergoing unregistration in another
+ * thread, and the device pointer may be invalid upon return.
  *
- * Unregister an IB device.  All clients will receive a remove callback.
+ * If true is returned then the caller owns a kref associated with the device
+ * (taken from the kref owned by the list).
+ *
+ * Upon return the device cannot be returned by any 'get' functions.
  */
-void ib_unregister_device(struct ib_device *device)
+static bool ib_pre_unregister(struct ib_device *device)
+{
+	bool already_removed;
+
+	mutex_lock(&device_mutex);
+	down_write(&lists_rwsem);
+	already_removed = list_empty(&device->core_list);
+	list_del_init(&device->core_list);
+	up_write(&lists_rwsem);
+	mutex_unlock(&device_mutex);
+
+	/* Matches the recound_set in ib_alloc_device */
+	ib_device_put(device);
+
+	return !already_removed;
+}
+
+static void __ib_unregister_device(struct ib_device *device)
 {
 	struct ib_client_data *context, *tmp;
 	unsigned long flags;
 
-	/*
-	 * Wait for all netlink command callers to finish working on the
-	 * device.
-	 */
-	ib_device_put(device);
 	wait_for_completion(&device->unreg_completion);
 
 	mutex_lock(&device_mutex);
 
 	down_write(&lists_rwsem);
-	list_del(&device->core_list);
 	write_lock_irq(&device->client_data_lock);
 	list_for_each_entry(context, &device->client_data_list, list)
 		context->going_down = true;
@@ -696,10 +734,116 @@  void ib_unregister_device(struct ib_device *device)
 	up_write(&lists_rwsem);
 
 	device->reg_state = IB_DEV_UNREGISTERED;
+
+	/*
+	 * Drivers using the new flow may not call ib_dealloc_device except
+	 * in error unwind prior to registration success.
+	 */
+	if (device->driver_unregister) {
+		device->driver_unregister(device);
+		ib_dealloc_device(device);
+	}
+}
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device: The device to unregister
+ *
+ * Unregister an IB device.  All clients will receive a remove callback.
+ *
+ * Callers can call this routine only once, and must protect against
+ * races. Typically it should only be called as part of a remove callback in
+ * an implementation of driver core's struct device_driver and related.
+ *
+ * A driver must choose to use either only ib_unregister_device or
+ * ib_unregister_device_and_put & ib_unregister_driver.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+	down_read(&unregister_rwsem);
+	if (ib_pre_unregister(device))
+		__ib_unregister_device(device);
+	else
+		WARN(true, "Driver mixing %s with _and_put", __func__);
+	up_read(&unregister_rwsem);
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
 /**
+ * ib_unregister_device_and_put - Unregister a device while holding a 'get'
+ * device: The device to unregister
+ *
+ * This is the same as ib_unregister_device(), except it includes an internal
+ * ib_device_put() that should match a 'get' obtained by the caller.
+ *
+ * It is safe to call this routine concurrently from multiple threads while
+ * holding the 'get', however in this case return from the function does not
+ * guarantee the device is destroyed, only that destruction is in-progress.
+ *
+ * Drivers using this flow MUST use the driver_unregister callback to clean up
+ * their resources associated with the device and dealloc it.
+ */
+void ib_unregister_device_and_put(struct ib_device *device)
+{
+	down_read(&unregister_rwsem);
+	if (ib_pre_unregister(device)) {
+		/*
+		 * Caller's get can be returned now that we hold the kref,
+		 * otherwise ib_pre_unregister returned the caller's get
+		 */
+		ib_device_put(device);
+		__ib_unregister_device(device);
+	}
+	up_read(&unregister_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_device_and_put);
+
+/**
+ * ib_unregister_driver - Unregister all IB devices for a driver
+ * @driver_id: The driver to unregister
+ *
+ * This implements a fence for device unregistration. It only returns once all
+ * devices associated with the driver_id have fully completed their
+ * unregistration and returned from ib_unregister_device*().
+ *
+ * If device's are not yet unregistered it goes ahead and starts unregistering
+ * them.
+ *
+ * The caller must ensure that no devices can be be added while this is
+ * running (or at all for the module_unload case)
+ */
+void ib_unregister_driver(enum rdma_driver_id driver_id)
+{
+	struct ib_device *device, *tmp;
+	LIST_HEAD(driver_devs);
+
+	down_write(&unregister_rwsem);
+	mutex_lock(&device_mutex);
+	down_write(&lists_rwsem);
+	list_for_each_entry_safe(device, tmp, &device_list, core_list) {
+		if (device->driver_id == driver_id)
+			list_move(&device->core_list, &driver_devs);
+	}
+	up_write(&lists_rwsem);
+
+	while ((device = list_first_entry_or_null(
+			&driver_devs, struct ib_device, core_list))) {
+		mutex_unlock(&device_mutex);
+		/*
+		 * We are holding the unregister_rwsem, so it is impossible
+		 * for another thread to be doing registration.
+		 */
+		if (!ib_pre_unregister(device))
+			WARN(true, "Impossible failure of ib_pre_unregister");
+		__ib_unregister_device(device);
+		mutex_lock(&device_mutex);
+	}
+	mutex_unlock(&device_mutex);
+	up_write(&unregister_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_driver);
+
+/**
  * ib_register_client - Register an IB client
  * @client:Client to register
  *
@@ -981,6 +1125,53 @@  void ib_enum_roce_netdev(struct ib_device *ib_dev,
 		}
 }
 
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+					  enum rdma_driver_id driver_id)
+{
+	struct ib_device *ib_dev;
+
+	down_read(&lists_rwsem);
+	list_for_each_entry(ib_dev, &device_list, core_list) {
+		unsigned int port;
+
+		if (driver_id != RDMA_DRIVER_UNKNOWN &&
+		    ib_dev->driver_id != driver_id)
+			continue;
+
+		if (!ib_dev->get_netdev)
+			continue;
+
+		for (port = rdma_start_port(ib_dev);
+		     port <= rdma_end_port(ib_dev); port++) {
+			struct net_device *idev;
+
+			idev = ib_dev->get_netdev(ib_dev, port);
+			if (idev != ndev) {
+				if (idev)
+					dev_put(idev);
+				continue;
+			}
+
+			/*
+			 * Do not return a device if unregistration has
+			 * started.
+			 */
+			if (!refcount_inc_not_zero(&ib_dev->refcount)) {
+				dev_put(idev);
+				continue;
+			}
+
+			up_read(&lists_rwsem);
+			dev_put(idev);
+			return ib_dev;
+		}
+	}
+	up_read(&lists_rwsem);
+
+	return NULL;
+}
+EXPORT_SYMBOL(ib_device_get_by_netdev);
+
 /**
  * ib_enum_all_roce_netdevs - enumerate all RoCE devices
  * @filter: Should we call the callback?
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 383e65c7bbc0..4069463f9a66 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -50,8 +50,10 @@  static void rxe_cleanup_ports(struct rxe_dev *rxe)
 /* free resources for a rxe device all objects created for this device must
  * have been destroyed
  */
-static void rxe_cleanup(struct rxe_dev *rxe)
+void rxe_unregistered(struct ib_device *ib_dev)
 {
+	struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
+
 	rxe_pool_cleanup(&rxe->uc_pool);
 	rxe_pool_cleanup(&rxe->pd_pool);
 	rxe_pool_cleanup(&rxe->ah_pool);
@@ -68,15 +70,6 @@  static void rxe_cleanup(struct rxe_dev *rxe)
 	crypto_free_shash(rxe->tfm);
 }
 
-/* called when all references have been dropped */
-void rxe_release(struct kref *kref)
-{
-	struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt);
-
-	rxe_cleanup(rxe);
-	ib_dealloc_device(&rxe->ib_dev);
-}
-
 /* initialize rxe device parameters */
 static void rxe_init_device_param(struct rxe_dev *rxe)
 {
@@ -279,7 +272,6 @@  static int rxe_init(struct rxe_dev *rxe)
 	spin_lock_init(&rxe->mmap_offset_lock);
 	spin_lock_init(&rxe->pending_lock);
 	INIT_LIST_HEAD(&rxe->pending_mmaps);
-	INIT_LIST_HEAD(&rxe->list);
 
 	mutex_init(&rxe->usdev_lock);
 
@@ -312,31 +304,13 @@  int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
 {
 	int err;
 
-	kref_init(&rxe->ref_cnt);
-
 	err = rxe_init(rxe);
 	if (err)
-		goto err1;
+		return err;
 
 	rxe_set_mtu(rxe, mtu);
 
-	err = rxe_register_device(rxe);
-	if (err)
-		goto err1;
-
-	return 0;
-
-err1:
-	rxe_dev_put(rxe);
-	return err;
-}
-
-/* called by the ifc layer to remove a device */
-void rxe_remove(struct rxe_dev *rxe)
-{
-	rxe_unregister_device(rxe);
-
-	rxe_dev_put(rxe);
+	return rxe_register_device(rxe);
 }
 
 static int __init rxe_module_init(void)
@@ -360,7 +334,7 @@  static int __init rxe_module_init(void)
 
 static void __exit rxe_module_exit(void)
 {
-	rxe_remove_all();
+	ib_unregister_driver(RDMA_DRIVER_RXE);
 	rxe_net_exit();
 	rxe_cache_exit();
 
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
index 8f79bd86d033..75e7113a756d 100644
--- a/drivers/infiniband/sw/rxe/rxe.h
+++ b/drivers/infiniband/sw/rxe/rxe.h
@@ -96,17 +96,19 @@  static inline u32 rxe_crc32(struct rxe_dev *rxe,
 void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
 
 int rxe_add(struct rxe_dev *rxe, unsigned int mtu);
-void rxe_remove(struct rxe_dev *rxe);
-void rxe_remove_all(void);
 
 void rxe_rcv(struct sk_buff *skb);
 
-static inline void rxe_dev_put(struct rxe_dev *rxe)
+/* The caller must do a matching ib_device_put(&dev->ib_dev) */
+static inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev)
 {
-	kref_put(&rxe->ref_cnt, rxe_release);
+	struct ib_device *ibdev =
+		ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE);
+
+	if (!ibdev)
+		return NULL;
+	return container_of(ibdev, struct rxe_dev, ib_dev);
 }
-struct rxe_dev *net_to_rxe(struct net_device *ndev);
-struct rxe_dev *get_rxe_by_name(const char *name);
 
 void rxe_port_up(struct rxe_dev *rxe);
 void rxe_port_down(struct rxe_dev *rxe);
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index a675c9f2b427..e830d56a15c2 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -231,7 +231,7 @@  int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
 		      struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
 		      struct rxe_modify_srq_cmd *ucmd);
 
-void rxe_release(struct kref *kref);
+void rxe_unregistered(struct ib_device *ib_dev);
 
 int rxe_completer(void *arg);
 int rxe_requester(void *arg);
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index b26a8141f3ed..c7637ad5e584 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -45,43 +45,6 @@ 
 #include "rxe_net.h"
 #include "rxe_loc.h"
 
-static LIST_HEAD(rxe_dev_list);
-static DEFINE_SPINLOCK(dev_list_lock); /* spinlock for device list */
-
-struct rxe_dev *net_to_rxe(struct net_device *ndev)
-{
-	struct rxe_dev *rxe;
-	struct rxe_dev *found = NULL;
-
-	spin_lock_bh(&dev_list_lock);
-	list_for_each_entry(rxe, &rxe_dev_list, list) {
-		if (rxe->ndev == ndev) {
-			found = rxe;
-			break;
-		}
-	}
-	spin_unlock_bh(&dev_list_lock);
-
-	return found;
-}
-
-struct rxe_dev *get_rxe_by_name(const char *name)
-{
-	struct rxe_dev *rxe;
-	struct rxe_dev *found = NULL;
-
-	spin_lock_bh(&dev_list_lock);
-	list_for_each_entry(rxe, &rxe_dev_list, list) {
-		if (!strcmp(name, dev_name(&rxe->ib_dev.dev))) {
-			found = rxe;
-			break;
-		}
-	}
-	spin_unlock_bh(&dev_list_lock);
-	return found;
-}
-
-
 static struct rxe_recv_sockets recv_sockets;
 
 struct device *rxe_dma_device(struct rxe_dev *rxe)
@@ -229,18 +192,19 @@  static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	struct udphdr *udph;
 	struct net_device *ndev = skb->dev;
 	struct net_device *rdev = ndev;
-	struct rxe_dev *rxe = net_to_rxe(ndev);
+	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
 	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
 
 	if (!rxe && is_vlan_dev(rdev)) {
 		rdev = vlan_dev_real_dev(ndev);
-		rxe = net_to_rxe(rdev);
+		rxe = rxe_get_dev_from_net(rdev);
 	}
 	if (!rxe)
 		goto drop;
 
 	if (skb_linearize(skb)) {
 		pr_err("skb_linearize failed\n");
+		ib_device_put(&rxe->ib_dev);
 		goto drop;
 	}
 
@@ -253,6 +217,12 @@  static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 
 	rxe_rcv(skb);
 
+	/*
+	 * FIXME: this is in the wrong place, it needs to be done when pkt is
+	 * destroyed
+	 */
+	ib_device_put(&rxe->ib_dev);
+
 	return 0;
 drop:
 	kfree_skb(skb);
@@ -558,36 +528,18 @@  struct rxe_dev *rxe_net_add(struct net_device *ndev)
 	rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
 	if (!rxe)
 		return NULL;
-
+	rxe->ib_dev.driver_unregister = rxe_unregistered;
 	rxe->ndev = ndev;
 
 	err = rxe_add(rxe, ndev->mtu);
 	if (err) {
-		ib_dealloc_device(&rxe->ib_dev);
+		rxe_unregistered(&rxe->ib_dev);
 		return NULL;
 	}
 
-	spin_lock_bh(&dev_list_lock);
-	list_add_tail(&rxe->list, &rxe_dev_list);
-	spin_unlock_bh(&dev_list_lock);
 	return rxe;
 }
 
-void rxe_remove_all(void)
-{
-	spin_lock_bh(&dev_list_lock);
-	while (!list_empty(&rxe_dev_list)) {
-		struct rxe_dev *rxe =
-			list_first_entry(&rxe_dev_list, struct rxe_dev, list);
-
-		list_del(&rxe->list);
-		spin_unlock_bh(&dev_list_lock);
-		rxe_remove(rxe);
-		spin_lock_bh(&dev_list_lock);
-	}
-	spin_unlock_bh(&dev_list_lock);
-}
-
 static void rxe_port_event(struct rxe_dev *rxe,
 			   enum ib_event_type event)
 {
@@ -630,16 +582,16 @@  static int rxe_notify(struct notifier_block *not_blk,
 		      void *arg)
 {
 	struct net_device *ndev = netdev_notifier_info_to_dev(arg);
-	struct rxe_dev *rxe = net_to_rxe(ndev);
+	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
 
 	if (!rxe)
-		goto out;
+		return NOTIFY_OK;
 
 	switch (event) {
 	case NETDEV_UNREGISTER:
-		list_del(&rxe->list);
-		rxe_remove(rxe);
-		break;
+		ib_unregister_device_and_put(&rxe->ib_dev);
+		return NOTIFY_OK;
+
 	case NETDEV_UP:
 		rxe_port_up(rxe);
 		break;
@@ -666,7 +618,8 @@  static int rxe_notify(struct notifier_block *not_blk,
 			event, ndev->name);
 		break;
 	}
-out:
+
+	ib_device_put(&rxe->ib_dev);
 	return NOTIFY_OK;
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index a04a076e03af..bb7e3268713e 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -390,8 +390,6 @@  void *rxe_alloc(struct rxe_pool *pool)
 	kref_get(&pool->ref_cnt);
 	read_unlock_irqrestore(&pool->pool_lock, flags);
 
-	kref_get(&pool->rxe->ref_cnt);
-
 	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
 		goto out_put_pool;
 
@@ -408,7 +406,6 @@  void *rxe_alloc(struct rxe_pool *pool)
 
 out_put_pool:
 	atomic_dec(&pool->num_elem);
-	rxe_dev_put(pool->rxe);
 	rxe_pool_put(pool);
 	return NULL;
 }
@@ -424,7 +421,6 @@  void rxe_elem_release(struct kref *kref)
 
 	kmem_cache_free(pool_cache(pool), elem);
 	atomic_dec(&pool->num_elem);
-	rxe_dev_put(pool->rxe);
 	rxe_pool_put(pool);
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c
index 73a19f808e1b..87c17675cf9d 100644
--- a/drivers/infiniband/sw/rxe/rxe_sysfs.c
+++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
@@ -53,20 +53,14 @@  static int sanitize_arg(const char *val, char *intf, int intf_len)
 	return len;
 }
 
-static void rxe_set_port_state(struct net_device *ndev)
+static void rxe_set_port_state(struct rxe_dev *rxe, struct net_device *ndev)
 {
-	struct rxe_dev *rxe = net_to_rxe(ndev);
 	bool is_up = netif_running(ndev) && netif_carrier_ok(ndev);
 
-	if (!rxe)
-		goto out;
-
 	if (is_up)
 		rxe_port_up(rxe);
 	else
 		rxe_port_down(rxe); /* down for unknown state */
-out:
-	return;
 }
 
 static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
@@ -74,24 +68,25 @@  static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
 	int len;
 	int err = 0;
 	char intf[32];
-	struct net_device *ndev = NULL;
+	struct net_device *ndev;
+	struct rxe_dev *exists;
 	struct rxe_dev *rxe;
 
 	len = sanitize_arg(val, intf, sizeof(intf));
 	if (!len) {
 		pr_err("add: invalid interface name\n");
-		err = -EINVAL;
-		goto err;
+		return -EINVAL;
 	}
 
 	ndev = dev_get_by_name(&init_net, intf);
 	if (!ndev) {
 		pr_err("interface %s not found\n", intf);
-		err = -EINVAL;
-		goto err;
+		return -EINVAL;
 	}
 
-	if (net_to_rxe(ndev)) {
+	exists = rxe_get_dev_from_net(ndev);
+	if (exists) {
+		ib_device_put(&exists->ib_dev);
 		pr_err("already configured on %s\n", intf);
 		err = -EINVAL;
 		goto err;
@@ -104,11 +99,11 @@  static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
 		goto err;
 	}
 
-	rxe_set_port_state(ndev);
+	rxe_set_port_state(rxe, ndev);
 	dev_info(&rxe->ib_dev.dev, "added %s\n", intf);
+
 err:
-	if (ndev)
-		dev_put(ndev);
+	dev_put(ndev);
 	return err;
 }
 
@@ -116,7 +111,7 @@  static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
 {
 	int len;
 	char intf[32];
-	struct rxe_dev *rxe;
+	struct ib_device *ib_dev;
 
 	len = sanitize_arg(val, intf, sizeof(intf));
 	if (!len) {
@@ -126,20 +121,18 @@  static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
 
 	if (strncmp("all", intf, len) == 0) {
 		pr_info("rxe_sys: remove all");
-		rxe_remove_all();
+		ib_unregister_driver(RDMA_DRIVER_RXE);
 		return 0;
 	}
 
-	rxe = get_rxe_by_name(intf);
+	ib_dev = ib_device_get_by_name(intf, RDMA_DRIVER_RXE);
 
-	if (!rxe) {
+	if (!ib_dev) {
 		pr_err("not configured on %s\n", intf);
 		return -EINVAL;
 	}
 
-	list_del(&rxe->list);
-	rxe_remove(rxe);
-
+	ib_unregister_device_and_put(ib_dev);
 	return 0;
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 30817c79ba96..28beee8f5838 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1286,10 +1286,3 @@  int rxe_register_device(struct rxe_dev *rxe)
 
 	return err;
 }
-
-void rxe_unregister_device(struct rxe_dev *rxe)
-{
-	struct ib_device *dev = &rxe->ib_dev;
-
-	ib_unregister_device(dev);
-}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 831381b7788d..a0b635812784 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -385,7 +385,6 @@  struct rxe_dev {
 	struct ib_device_attr	attr;
 	int			max_ucontext;
 	int			max_inline_data;
-	struct kref		ref_cnt;
 	struct mutex	usdev_lock;
 
 	struct net_device	*ndev;
@@ -412,7 +411,6 @@  struct rxe_dev {
 	u64			stats_counters[RXE_NUM_OF_COUNTERS];
 
 	struct rxe_port		port;
-	struct list_head	list;
 	struct crypto_shash	*tfm;
 };
 
@@ -467,7 +465,6 @@  static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
 }
 
 int rxe_register_device(struct rxe_dev *rxe);
-void rxe_unregister_device(struct rxe_dev *rxe);
 
 void rxe_mc_cleanup(struct rxe_pool_entry *arg);
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 85021451eee0..5fbeedacb3aa 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2534,6 +2534,9 @@  struct ib_device {
 				 struct ib_counters_read_attr *counters_read_attr,
 				 struct uverbs_attr_bundle *attrs);
 
+	/* Called after all clients have been destroyed*/
+	void (*driver_unregister)(struct ib_device *dev);
+
 	/**
 	 * rdma netdev operation
 	 *
@@ -2653,6 +2656,8 @@  int ib_register_device(struct ib_device *device, const char *name,
 		       int (*port_callback)(struct ib_device *, u8,
 					    struct kobject *));
 void ib_unregister_device(struct ib_device *device);
+void ib_unregister_driver(enum rdma_driver_id driver_id);
+void ib_unregister_device_and_put(struct ib_device *device);
 
 int ib_register_client   (struct ib_client *client);
 void ib_unregister_client(struct ib_client *client);
@@ -3937,6 +3942,11 @@  static inline bool ib_access_writable(int access_flags)
 int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
 		       struct ib_mr_status *mr_status);
 
+void ib_device_put(struct ib_device *device);
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+					  enum rdma_driver_id driver_id);
+struct ib_device *ib_device_get_by_name(const char *name,
+					enum rdma_driver_id driver_id);
 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
 					    u16 pkey, const union ib_gid *gid,
 					    const struct sockaddr *addr);