diff mbox series

[v6,3/4] RDMA/Core: add RDMA_NLDEV_CMD_NEWLINK/DELLINK support

Message ID f73a899cd1ecab37470bb0ad0da61699af0b0775.1544033819.git.swise@opengridcomputing.com (mailing list archive)
State Superseded
Headers show
Series Dynamic rdma link creation | expand

Commit Message

Steve Wise Dec. 5, 2018, 3:14 p.m. UTC
Add support for new LINK messages to allow adding and deleting rdma
interfaces.  This will be used initially for soft rdma drivers which
instantiate device instances dynamically by the admin specifying a
netdev device to use.  The rdma_rxe module will be the first user of
these messages.

The design is modeled after RTNL_NEWLINK/DELLINK:  rdma drivers
register with the rdma core if they provide link add/delete functions.
Each driver registers with a unique "type" string, that is used to
dispatch messages coming from user space.  A new RDMA_NLDEV_ATTR
is defined for the "type" string.  User mode will pass 3 attributes
in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired rdma
device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type"
of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the net_device
interface to use for this link.  The DELLINK message will contain the
RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/nldev.c  | 134 +++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h          |   2 +
 include/rdma/rdma_netlink.h      |  13 ++++
 include/uapi/rdma/rdma_netlink.h |  11 +++-
 4 files changed, 158 insertions(+), 2 deletions(-)

Comments

Michael J. Ruhl Dec. 5, 2018, 8:14 p.m. UTC | #1
>-----Original Message-----
>From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
>owner@vger.kernel.org] On Behalf Of Steve Wise
>Sent: Wednesday, December 5, 2018 10:14 AM
>To: dledford@redhat.com; jgg@mellanox.com
>Cc: linux-rdma@vger.kernel.org; BMT@zurich.ibm.com; leon@kernel.org;
>markb@mellanox.com; yanjun.zhu@oracle.com
>Subject: [PATCH v6 3/4] RDMA/Core: add
>RDMA_NLDEV_CMD_NEWLINK/DELLINK support
>
>Add support for new LINK messages to allow adding and deleting rdma
>interfaces.  This will be used initially for soft rdma drivers which
>instantiate device instances dynamically by the admin specifying a
>netdev device to use.  The rdma_rxe module will be the first user of
>these messages.
>
>The design is modeled after RTNL_NEWLINK/DELLINK:  rdma drivers
>register with the rdma core if they provide link add/delete functions.
>Each driver registers with a unique "type" string, that is used to
>dispatch messages coming from user space.  A new RDMA_NLDEV_ATTR
>is defined for the "type" string.  User mode will pass 3 attributes
>in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired
>rdma
>device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type"
>of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the
>net_device
>interface to use for this link.  The DELLINK message will contain the
>RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete.
>
>Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
>---
> drivers/infiniband/core/nldev.c  | 134
>+++++++++++++++++++++++++++++++++++++++
> include/rdma/ib_verbs.h          |   2 +
> include/rdma/rdma_netlink.h      |  13 ++++
> include/uapi/rdma/rdma_netlink.h |  11 +++-
> 4 files changed, 158 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
>index 9abbadb9e366..b6d97c592074 100644
>--- a/drivers/infiniband/core/nldev.c
>+++ b/drivers/infiniband/core/nldev.c
>@@ -33,6 +33,7 @@
> #include <linux/module.h>
> #include <linux/pid.h>
> #include <linux/pid_namespace.h>
>+#include <linux/mutex.h>
> #include <net/netlink.h>
> #include <rdma/rdma_cm.h>
> #include <rdma/rdma_netlink.h>
>@@ -107,6 +108,8 @@
> 	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
> 	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
> 	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
>+	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type =
>NLA_NUL_STRING,
>+				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN
>},
> };
>
> static int put_driver_name_print_type(struct sk_buff *msg, const char
>*name,
>@@ -1104,6 +1107,129 @@ static int nldev_res_get_pd_dumpit(struct
>sk_buff *skb,
> 	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
> }
>
>+static LIST_HEAD(link_ops);
>+static DECLARE_RWSEM(link_ops_rwsem);
>+
>+static const struct rdma_link_ops *link_ops_get(const char *type)
>+{
>+	const struct rdma_link_ops *ops;
>+
>+	list_for_each_entry(ops, &link_ops, list) {
>+		if (!strcmp(ops->type, type))
>+			goto out;
>+	}
>+	ops = NULL;
>+out:
>+	return ops;
>+}
>+
>+void rdma_link_register(struct rdma_link_ops *ops)
>+{
>+	down_write(&link_ops_rwsem);
>+	if (link_ops_get(ops->type)) {
>+		WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type);
>+		goto out;
>+	}
>+	list_add(&ops->list, &link_ops);
>+out:
>+	up_write(&link_ops_rwsem);
>+}
>+EXPORT_SYMBOL(rdma_link_register);
>+
>+void rdma_link_unregister(struct rdma_link_ops *ops)
>+{
>+	down_write(&link_ops_rwsem);
>+	list_del(&ops->list);
>+	up_write(&link_ops_rwsem);
>+}
>+EXPORT_SYMBOL(rdma_link_unregister);
>+
>+static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
>+			  struct netlink_ext_ack *extack)
>+{
>+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
>+	char ibdev_name[IB_DEVICE_NAME_MAX];
>+	const struct rdma_link_ops *ops;
>+	struct ib_device *device;
>+	char ndev_name[IFNAMSIZ];
>+	char type[IFNAMSIZ];
>+	int err;
>+
>+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
>+			  nldev_policy, extack);
>+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
>+	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] ||
>!tb[RDMA_NLDEV_ATTR_NDEV_NAME])
>+		return -EINVAL;
>+
>+	nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
>+		    sizeof(ibdev_name));
>+	if (strchr(ibdev_name, '%'))
>+		return -EINVAL;
>+
>+	nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
>+	nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
>+		    sizeof(ndev_name));
>+
>+	down_read(&link_ops_rwsem);
>+	ops = link_ops_get(type);
>+#ifdef CONFIG_MODULES
>+	if (!ops) {
>+		up_read(&link_ops_rwsem);
>+		request_module("rdma-link-%s", type);
>+		down_read(&link_ops_rwsem);
>+		ops = link_ops_get(type);
>+	}
>+#endif
>+	if (ops) {
>+		device = ops->newlink(ibdev_name, ndev_name);
>+		if (IS_ERR(device))
>+			err = PTR_ERR(device);
>+		else
>+			device->link_ops = ops;
>+	} else {
>+		err = -ENODEV;
>+	}
>+	up_read(&link_ops_rwsem);
>+
>+	return err;
>+}

The rwsem makes a lot more sense (than the mutex).

Thanks!

If you would like:

Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>

Mike

>+
>+static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
>+			  struct netlink_ext_ack *extack)
>+{
>+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
>+	const struct rdma_link_ops *ops;
>+	struct device *dma_device;
>+	struct ib_device *device;
>+	u32 index;
>+	int err;
>+
>+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
>+			  nldev_policy, extack);
>+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
>+		return -EINVAL;
>+
>+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
>+	device = ib_device_get_by_index(index);
>+	if (!device)
>+		return -EINVAL;
>+
>+	ops = device->link_ops;
>+
>+	/*
>+	 * Deref the ib_device before deleting it.  Otherwise we
>+	 * deadlock unregistering the device.  Hold a ref on the
>+	 * underlying dma_device though to keep the memory around
>+	 * until we're done.
>+	 */
>+	dma_device = get_device(device->dma_device);
>+	ib_device_put(device);
>+	err = ops ? ops->dellink(device) : -ENODEV;
>+	put_device(dma_device);
>+
>+	return err;
>+}
>+
> static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] =
>{
> 	[RDMA_NLDEV_CMD_GET] = {
> 		.doit = nldev_get_doit,
>@@ -1113,6 +1239,14 @@ static int nldev_res_get_pd_dumpit(struct sk_buff
>*skb,
> 		.doit = nldev_set_doit,
> 		.flags = RDMA_NL_ADMIN_PERM,
> 	},
>+	[RDMA_NLDEV_CMD_NEWLINK] = {
>+		.doit = nldev_newlink,
>+		.flags = RDMA_NL_ADMIN_PERM,
>+	},
>+	[RDMA_NLDEV_CMD_DELLINK] = {
>+		.doit = nldev_dellink,
>+		.flags = RDMA_NL_ADMIN_PERM,
>+	},
> 	[RDMA_NLDEV_CMD_PORT_GET] = {
> 		.doit = nldev_port_get_doit,
> 		.dump = nldev_port_get_dumpit,
>diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
>index 85021451eee0..237d87c52e3a 100644
>--- a/include/rdma/ib_verbs.h
>+++ b/include/rdma/ib_verbs.h
>@@ -2612,6 +2612,8 @@ struct ib_device {
> 	 */
> 	refcount_t refcount;
> 	struct completion unreg_completion;
>+
>+	const struct rdma_link_ops     *link_ops;
> };
>
> struct ib_client {
>diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
>index 70218e6b5187..9b50e957cbae 100644
>--- a/include/rdma/rdma_netlink.h
>+++ b/include/rdma/rdma_netlink.h
>@@ -99,4 +99,17 @@ int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr
>*nlh,
>  * Returns true on success or false if no listeners.
>  */
> bool rdma_nl_chk_listeners(unsigned int group);
>+
>+struct rdma_link_ops {
>+	struct list_head list;
>+	const char *type;
>+	struct ib_device *(*newlink)(const char *ibdev_name,
>+				     const char *ndev_name);
>+	int (*dellink)(struct ib_device *ibdev);
>+};
>+
>+void rdma_link_register(struct rdma_link_ops *ops);
>+void rdma_link_unregister(struct rdma_link_ops *ops);
>+
>+#define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-"
>type)
> #endif /* _RDMA_NETLINK_H */
>diff --git a/include/uapi/rdma/rdma_netlink.h
>b/include/uapi/rdma/rdma_netlink.h
>index f9c41bf59efc..dfdfc2b608b8 100644
>--- a/include/uapi/rdma/rdma_netlink.h
>+++ b/include/uapi/rdma/rdma_netlink.h
>@@ -229,9 +229,11 @@ enum rdma_nldev_command {
> 	RDMA_NLDEV_CMD_GET, /* can dump */
> 	RDMA_NLDEV_CMD_SET,
>
>-	/* 3 - 4 are free to use */
>+	RDMA_NLDEV_CMD_NEWLINK,
>
>-	RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */
>+	RDMA_NLDEV_CMD_DELLINK,
>+
>+	RDMA_NLDEV_CMD_PORT_GET, /* can dump */
>
> 	/* 6 - 8 are free to use */
>
>@@ -428,6 +430,11 @@ enum rdma_nldev_attr {
> 	RDMA_NLDEV_ATTR_DRIVER_U64,		/* u64 */
>
> 	/*
>+	 * Identifies the rdma driver. eg: "rxe" or "siw"
>+	 */
>+	RDMA_NLDEV_ATTR_LINK_TYPE,		/* string */
>+
>+	/*
> 	 * Always the end
> 	 */
> 	RDMA_NLDEV_ATTR_MAX
>--
>1.8.3.1
Steve Wise Dec. 6, 2018, 3:44 p.m. UTC | #2
Commenting on my own submission:

On 12/5/2018 9:14 AM, Steve Wise wrote:
> Add support for new LINK messages to allow adding and deleting rdma
> interfaces.  This will be used initially for soft rdma drivers which
> instantiate device instances dynamically by the admin specifying a
> netdev device to use.  The rdma_rxe module will be the first user of
> these messages.
>
> The design is modeled after RTNL_NEWLINK/DELLINK:  rdma drivers
> register with the rdma core if they provide link add/delete functions.
> Each driver registers with a unique "type" string, that is used to
> dispatch messages coming from user space.  A new RDMA_NLDEV_ATTR
> is defined for the "type" string.  User mode will pass 3 attributes
> in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired rdma
> device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type"
> of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the net_device
> interface to use for this link.  The DELLINK message will contain the
> RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete.
>
> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
> Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
> ---
>  drivers/infiniband/core/nldev.c  | 134 +++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_verbs.h          |   2 +
>  include/rdma/rdma_netlink.h      |  13 ++++
>  include/uapi/rdma/rdma_netlink.h |  11 +++-
>  4 files changed, 158 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
> index 9abbadb9e366..b6d97c592074 100644
> --- a/drivers/infiniband/core/nldev.c
> +++ b/drivers/infiniband/core/nldev.c
> @@ -33,6 +33,7 @@
>  #include <linux/module.h>
>  #include <linux/pid.h>
>  #include <linux/pid_namespace.h>
> +#include <linux/mutex.h>
>  #include <net/netlink.h>
>  #include <rdma/rdma_cm.h>
>  #include <rdma/rdma_netlink.h>
> @@ -107,6 +108,8 @@
>  	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
>  	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
>  	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
> +	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type = NLA_NUL_STRING,
> +				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
>  };
>  
>  static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
> @@ -1104,6 +1107,129 @@ static int nldev_res_get_pd_dumpit(struct sk_buff *skb,
>  	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
>  }
>  
> +static LIST_HEAD(link_ops);
> +static DECLARE_RWSEM(link_ops_rwsem);
> +
> +static const struct rdma_link_ops *link_ops_get(const char *type)
> +{
> +	const struct rdma_link_ops *ops;
> +
> +	list_for_each_entry(ops, &link_ops, list) {
> +		if (!strcmp(ops->type, type))
> +			goto out;
> +	}
> +	ops = NULL;
> +out:
> +	return ops;
> +}
> +
> +void rdma_link_register(struct rdma_link_ops *ops)
> +{
> +	down_write(&link_ops_rwsem);
> +	if (link_ops_get(ops->type)) {
> +		WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type);
> +		goto out;
> +	}
> +	list_add(&ops->list, &link_ops);
> +out:
> +	up_write(&link_ops_rwsem);
> +}
> +EXPORT_SYMBOL(rdma_link_register);
> +
> +void rdma_link_unregister(struct rdma_link_ops *ops)
> +{
> +	down_write(&link_ops_rwsem);
> +	list_del(&ops->list);
> +	up_write(&link_ops_rwsem);
> +}
> +EXPORT_SYMBOL(rdma_link_unregister);
> +
> +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
> +			  struct netlink_ext_ack *extack)
> +{
> +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
> +	char ibdev_name[IB_DEVICE_NAME_MAX];
> +	const struct rdma_link_ops *ops;
> +	struct ib_device *device;
> +	char ndev_name[IFNAMSIZ];
> +	char type[IFNAMSIZ];
> +	int err;
> +
> +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
> +			  nldev_policy, extack);
> +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
> +	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
> +		return -EINVAL;
> +
> +	nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
> +		    sizeof(ibdev_name));
> +	if (strchr(ibdev_name, '%'))
> +		return -EINVAL;
> +
> +	nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
> +	nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
> +		    sizeof(ndev_name));
> +
> +	down_read(&link_ops_rwsem);
> +	ops = link_ops_get(type);
> +#ifdef CONFIG_MODULES
> +	if (!ops) {
> +		up_read(&link_ops_rwsem);
> +		request_module("rdma-link-%s", type);
> +		down_read(&link_ops_rwsem);
> +		ops = link_ops_get(type);
> +	}
> +#endif
> +	if (ops) {
> +		device = ops->newlink(ibdev_name, ndev_name);
> +		if (IS_ERR(device))
> +			err = PTR_ERR(device);
> +		else
> +			device->link_ops = ops;
> +	} else {
> +		err = -ENODEV;
> +	}
> +	up_read(&link_ops_rwsem);
> +
> +	return err;
> +}
> +
> +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
> +			  struct netlink_ext_ack *extack)
> +{
> +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
> +	const struct rdma_link_ops *ops;
> +	struct device *dma_device;
> +	struct ib_device *device;
> +	u32 index;
> +	int err;
> +
> +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
> +			  nldev_policy, extack);
> +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
> +		return -EINVAL;
> +
> +	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
> +	device = ib_device_get_by_index(index);
> +	if (!device)
> +		return -EINVAL;
> +
> +	ops = device->link_ops;
> +
> +	/*
> +	 * Deref the ib_device before deleting it.  Otherwise we
> +	 * deadlock unregistering the device.  Hold a ref on the
> +	 * underlying dma_device though to keep the memory around
> +	 * until we're done.
> +	 */
> +	dma_device = get_device(device->dma_device);
> +	ib_device_put(device);
> +	err = ops ? ops->dellink(device) : -ENODEV;
> +	put_device(dma_device);
> +
> +	return err;
> +}
> +


There is still a problem with the above code.  If thread 1 is running
this code and gets up to just after ib_device_put() but before calling
ops->dellink, and then thread2 runs the rmmod code of rxe, thread 2 can
remove the device and complete the module unload.  Then thread 1
continues and calls the ops->dellink  function which has been unloaded. 

I think the above code needs a module reference before the
ib_device_put(), and a  module dereference after calling ops->dellink. 

Thoughts?

Steve.
Michael J. Ruhl Dec. 6, 2018, 4:54 p.m. UTC | #3
>-----Original Message-----
>From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
>owner@vger.kernel.org] On Behalf Of Steve Wise
>Sent: Thursday, December 6, 2018 10:44 AM
>To: dledford@redhat.com; jgg@mellanox.com
>Cc: linux-rdma@vger.kernel.org; BMT@zurich.ibm.com; leon@kernel.org;
>markb@mellanox.com; yanjun.zhu@oracle.com
>Subject: Re: [PATCH v6 3/4] RDMA/Core: add
>RDMA_NLDEV_CMD_NEWLINK/DELLINK support
>
>Commenting on my own submission:
>
>On 12/5/2018 9:14 AM, Steve Wise wrote:
>> Add support for new LINK messages to allow adding and deleting rdma
>> interfaces.  This will be used initially for soft rdma drivers which
>> instantiate device instances dynamically by the admin specifying a
>> netdev device to use.  The rdma_rxe module will be the first user of
>> these messages.
>>
>> The design is modeled after RTNL_NEWLINK/DELLINK:  rdma drivers
>> register with the rdma core if they provide link add/delete functions.
>> Each driver registers with a unique "type" string, that is used to
>> dispatch messages coming from user space.  A new RDMA_NLDEV_ATTR
>> is defined for the "type" string.  User mode will pass 3 attributes
>> in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired
>rdma
>> device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type"
>> of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the
>net_device
>> interface to use for this link.  The DELLINK message will contain the
>> RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete.
>>
>> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>> Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
>> ---
>>  drivers/infiniband/core/nldev.c  | 134
>+++++++++++++++++++++++++++++++++++++++
>>  include/rdma/ib_verbs.h          |   2 +
>>  include/rdma/rdma_netlink.h      |  13 ++++
>>  include/uapi/rdma/rdma_netlink.h |  11 +++-
>>  4 files changed, 158 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/infiniband/core/nldev.c
>b/drivers/infiniband/core/nldev.c
>> index 9abbadb9e366..b6d97c592074 100644
>> --- a/drivers/infiniband/core/nldev.c
>> +++ b/drivers/infiniband/core/nldev.c
>> @@ -33,6 +33,7 @@
>>  #include <linux/module.h>
>>  #include <linux/pid.h>
>>  #include <linux/pid_namespace.h>
>> +#include <linux/mutex.h>
>>  #include <net/netlink.h>
>>  #include <rdma/rdma_cm.h>
>>  #include <rdma/rdma_netlink.h>
>> @@ -107,6 +108,8 @@
>>  	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
>>  	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
>>  	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
>> +	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type =
>NLA_NUL_STRING,
>> +				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN
>},
>>  };
>>
>>  static int put_driver_name_print_type(struct sk_buff *msg, const char
>*name,
>> @@ -1104,6 +1107,129 @@ static int nldev_res_get_pd_dumpit(struct
>sk_buff *skb,
>>  	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
>>  }
>>
>> +static LIST_HEAD(link_ops);
>> +static DECLARE_RWSEM(link_ops_rwsem);
>> +
>> +static const struct rdma_link_ops *link_ops_get(const char *type)
>> +{
>> +	const struct rdma_link_ops *ops;
>> +
>> +	list_for_each_entry(ops, &link_ops, list) {
>> +		if (!strcmp(ops->type, type))
>> +			goto out;
>> +	}
>> +	ops = NULL;
>> +out:
>> +	return ops;
>> +}
>> +
>> +void rdma_link_register(struct rdma_link_ops *ops)
>> +{
>> +	down_write(&link_ops_rwsem);
>> +	if (link_ops_get(ops->type)) {
>> +		WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type);
>> +		goto out;
>> +	}
>> +	list_add(&ops->list, &link_ops);
>> +out:
>> +	up_write(&link_ops_rwsem);
>> +}
>> +EXPORT_SYMBOL(rdma_link_register);
>> +
>> +void rdma_link_unregister(struct rdma_link_ops *ops)
>> +{
>> +	down_write(&link_ops_rwsem);
>> +	list_del(&ops->list);
>> +	up_write(&link_ops_rwsem);
>> +}
>> +EXPORT_SYMBOL(rdma_link_unregister);
>> +
>> +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
>> +			  struct netlink_ext_ack *extack)
>> +{
>> +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
>> +	char ibdev_name[IB_DEVICE_NAME_MAX];
>> +	const struct rdma_link_ops *ops;
>> +	struct ib_device *device;
>> +	char ndev_name[IFNAMSIZ];
>> +	char type[IFNAMSIZ];
>> +	int err;
>> +
>> +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
>> +			  nldev_policy, extack);
>> +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
>> +	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] ||
>!tb[RDMA_NLDEV_ATTR_NDEV_NAME])
>> +		return -EINVAL;
>> +
>> +	nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
>> +		    sizeof(ibdev_name));
>> +	if (strchr(ibdev_name, '%'))
>> +		return -EINVAL;
>> +
>> +	nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
>> +	nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
>> +		    sizeof(ndev_name));
>> +
>> +	down_read(&link_ops_rwsem);
>> +	ops = link_ops_get(type);
>> +#ifdef CONFIG_MODULES
>> +	if (!ops) {
>> +		up_read(&link_ops_rwsem);
>> +		request_module("rdma-link-%s", type);
>> +		down_read(&link_ops_rwsem);
>> +		ops = link_ops_get(type);
>> +	}
>> +#endif
>> +	if (ops) {
>> +		device = ops->newlink(ibdev_name, ndev_name);
>> +		if (IS_ERR(device))
>> +			err = PTR_ERR(device);
>> +		else
>> +			device->link_ops = ops;
>> +	} else {
>> +		err = -ENODEV;
>> +	}
>> +	up_read(&link_ops_rwsem);
>> +
>> +	return err;
>> +}
>> +
>> +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
>> +			  struct netlink_ext_ack *extack)
>> +{
>> +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
>> +	const struct rdma_link_ops *ops;
>> +	struct device *dma_device;
>> +	struct ib_device *device;
>> +	u32 index;
>> +	int err;
>> +
>> +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
>> +			  nldev_policy, extack);
>> +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
>> +		return -EINVAL;
>> +
>> +	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
>> +	device = ib_device_get_by_index(index);
>> +	if (!device)
>> +		return -EINVAL;
>> +
>> +	ops = device->link_ops;
>> +
>> +	/*
>> +	 * Deref the ib_device before deleting it.  Otherwise we
>> +	 * deadlock unregistering the device.  Hold a ref on the
>> +	 * underlying dma_device though to keep the memory around
>> +	 * until we're done.
>> +	 */
>> +	dma_device = get_device(device->dma_device);
>> +	ib_device_put(device);
>> +	err = ops ? ops->dellink(device) : -ENODEV;
>> +	put_device(dma_device);
>> +
>> +	return err;
>> +}
>> +
>
>
>There is still a problem with the above code.  If thread 1 is running
>this code and gets up to just after ib_device_put() but before calling
>ops->dellink, and then thread2 runs the rmmod code of rxe, thread 2 can
>remove the device and complete the module unload.  Then thread 1
>continues and calls the ops->dellink  function which has been unloaded.
>
>I think the above code needs a module reference before the
>ib_device_put(), and a  module dereference after calling ops->dellink.
>
>Thoughts?

Would using the rwsem around this accomplish that?

M

>Steve.
>
Steve Wise Dec. 6, 2018, 5:59 p.m. UTC | #4
On 12/6/2018 10:54 AM, Ruhl, Michael J wrote:
>> -----Original Message-----
>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
>> owner@vger.kernel.org] On Behalf Of Steve Wise
>> Sent: Thursday, December 6, 2018 10:44 AM
>> To: dledford@redhat.com; jgg@mellanox.com
>> Cc: linux-rdma@vger.kernel.org; BMT@zurich.ibm.com; leon@kernel.org;
>> markb@mellanox.com; yanjun.zhu@oracle.com
>> Subject: Re: [PATCH v6 3/4] RDMA/Core: add
>> RDMA_NLDEV_CMD_NEWLINK/DELLINK support
>>
>> Commenting on my own submission:
>>
>> On 12/5/2018 9:14 AM, Steve Wise wrote:
>>> Add support for new LINK messages to allow adding and deleting rdma
>>> interfaces.  This will be used initially for soft rdma drivers which
>>> instantiate device instances dynamically by the admin specifying a
>>> netdev device to use.  The rdma_rxe module will be the first user of
>>> these messages.
>>>
>>> The design is modeled after RTNL_NEWLINK/DELLINK:  rdma drivers
>>> register with the rdma core if they provide link add/delete functions.
>>> Each driver registers with a unique "type" string, that is used to
>>> dispatch messages coming from user space.  A new RDMA_NLDEV_ATTR
>>> is defined for the "type" string.  User mode will pass 3 attributes
>>> in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired
>> rdma
>>> device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type"
>>> of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the
>> net_device
>>> interface to use for this link.  The DELLINK message will contain the
>>> RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete.
>>>
>>> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>>> Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
>>> ---
>>>  drivers/infiniband/core/nldev.c  | 134
>> +++++++++++++++++++++++++++++++++++++++
>>>  include/rdma/ib_verbs.h          |   2 +
>>>  include/rdma/rdma_netlink.h      |  13 ++++
>>>  include/uapi/rdma/rdma_netlink.h |  11 +++-
>>>  4 files changed, 158 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/infiniband/core/nldev.c
>> b/drivers/infiniband/core/nldev.c
>>> index 9abbadb9e366..b6d97c592074 100644
>>> --- a/drivers/infiniband/core/nldev.c
>>> +++ b/drivers/infiniband/core/nldev.c
>>> @@ -33,6 +33,7 @@
>>>  #include <linux/module.h>
>>>  #include <linux/pid.h>
>>>  #include <linux/pid_namespace.h>
>>> +#include <linux/mutex.h>
>>>  #include <net/netlink.h>
>>>  #include <rdma/rdma_cm.h>
>>>  #include <rdma/rdma_netlink.h>
>>> @@ -107,6 +108,8 @@
>>>  	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
>>>  	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
>>>  	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
>>> +	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type =
>> NLA_NUL_STRING,
>>> +				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN
>> },
>>>  };
>>>
>>>  static int put_driver_name_print_type(struct sk_buff *msg, const char
>> *name,
>>> @@ -1104,6 +1107,129 @@ static int nldev_res_get_pd_dumpit(struct
>> sk_buff *skb,
>>>  	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
>>>  }
>>>
>>> +static LIST_HEAD(link_ops);
>>> +static DECLARE_RWSEM(link_ops_rwsem);
>>> +
>>> +static const struct rdma_link_ops *link_ops_get(const char *type)
>>> +{
>>> +	const struct rdma_link_ops *ops;
>>> +
>>> +	list_for_each_entry(ops, &link_ops, list) {
>>> +		if (!strcmp(ops->type, type))
>>> +			goto out;
>>> +	}
>>> +	ops = NULL;
>>> +out:
>>> +	return ops;
>>> +}
>>> +
>>> +void rdma_link_register(struct rdma_link_ops *ops)
>>> +{
>>> +	down_write(&link_ops_rwsem);
>>> +	if (link_ops_get(ops->type)) {
>>> +		WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type);
>>> +		goto out;
>>> +	}
>>> +	list_add(&ops->list, &link_ops);
>>> +out:
>>> +	up_write(&link_ops_rwsem);
>>> +}
>>> +EXPORT_SYMBOL(rdma_link_register);
>>> +
>>> +void rdma_link_unregister(struct rdma_link_ops *ops)
>>> +{
>>> +	down_write(&link_ops_rwsem);
>>> +	list_del(&ops->list);
>>> +	up_write(&link_ops_rwsem);
>>> +}
>>> +EXPORT_SYMBOL(rdma_link_unregister);
>>> +
>>> +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
>>> +			  struct netlink_ext_ack *extack)
>>> +{
>>> +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
>>> +	char ibdev_name[IB_DEVICE_NAME_MAX];
>>> +	const struct rdma_link_ops *ops;
>>> +	struct ib_device *device;
>>> +	char ndev_name[IFNAMSIZ];
>>> +	char type[IFNAMSIZ];
>>> +	int err;
>>> +
>>> +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
>>> +			  nldev_policy, extack);
>>> +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
>>> +	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] ||
>> !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
>>> +		return -EINVAL;
>>> +
>>> +	nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
>>> +		    sizeof(ibdev_name));
>>> +	if (strchr(ibdev_name, '%'))
>>> +		return -EINVAL;
>>> +
>>> +	nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
>>> +	nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
>>> +		    sizeof(ndev_name));
>>> +
>>> +	down_read(&link_ops_rwsem);
>>> +	ops = link_ops_get(type);
>>> +#ifdef CONFIG_MODULES
>>> +	if (!ops) {
>>> +		up_read(&link_ops_rwsem);
>>> +		request_module("rdma-link-%s", type);
>>> +		down_read(&link_ops_rwsem);
>>> +		ops = link_ops_get(type);
>>> +	}
>>> +#endif
>>> +	if (ops) {
>>> +		device = ops->newlink(ibdev_name, ndev_name);
>>> +		if (IS_ERR(device))
>>> +			err = PTR_ERR(device);
>>> +		else
>>> +			device->link_ops = ops;
>>> +	} else {
>>> +		err = -ENODEV;
>>> +	}
>>> +	up_read(&link_ops_rwsem);
>>> +
>>> +	return err;
>>> +}
>>> +
>>> +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
>>> +			  struct netlink_ext_ack *extack)
>>> +{
>>> +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
>>> +	const struct rdma_link_ops *ops;
>>> +	struct device *dma_device;
>>> +	struct ib_device *device;
>>> +	u32 index;
>>> +	int err;
>>> +
>>> +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
>>> +			  nldev_policy, extack);
>>> +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
>>> +		return -EINVAL;
>>> +
>>> +	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
>>> +	device = ib_device_get_by_index(index);
>>> +	if (!device)
>>> +		return -EINVAL;
>>> +
>>> +	ops = device->link_ops;
>>> +
>>> +	/*
>>> +	 * Deref the ib_device before deleting it.  Otherwise we
>>> +	 * deadlock unregistering the device.  Hold a ref on the
>>> +	 * underlying dma_device though to keep the memory around
>>> +	 * until we're done.
>>> +	 */
>>> +	dma_device = get_device(device->dma_device);
>>> +	ib_device_put(device);
>>> +	err = ops ? ops->dellink(device) : -ENODEV;
>>> +	put_device(dma_device);
>>> +
>>> +	return err;
>>> +}
>>> +
>>
>> There is still a problem with the above code.  If thread 1 is running
>> this code and gets up to just after ib_device_put() but before calling
>> ops->dellink, and then thread2 runs the rmmod code of rxe, thread 2 can
>> remove the device and complete the module unload.  Then thread 1
>> continues and calls the ops->dellink  function which has been unloaded.
>>
>> I think the above code needs a module reference before the
>> ib_device_put(), and a  module dereference after calling ops->dellink.
>>
>> Thoughts?
> Would using the rwsem around this accomplish that?

That wouldn't help, unless nldev_dellink() looked up the link_ops with
the rwsem held, instead of having it cached in the ib_device struct. 
Which is sort of what I had in an earlier version.   But since
nldev_dellink() found the ib_device via ib_device_get_by_index(), we
know the rxe module cannot unload until ib_device_put() is called by
nldev_dellink().  I don't know how to stall the unload until
nldev_dellink() finishes other than a module reference.
Jason Gunthorpe Dec. 6, 2018, 7:26 p.m. UTC | #5
On Thu, Dec 06, 2018 at 09:44:11AM -0600, Steve Wise wrote:
> Commenting on my own submission:
> 
> On 12/5/2018 9:14 AM, Steve Wise wrote:
> > Add support for new LINK messages to allow adding and deleting rdma
> > interfaces.  This will be used initially for soft rdma drivers which
> > instantiate device instances dynamically by the admin specifying a
> > netdev device to use.  The rdma_rxe module will be the first user of
> > these messages.
> >
> > The design is modeled after RTNL_NEWLINK/DELLINK:  rdma drivers
> > register with the rdma core if they provide link add/delete functions.
> > Each driver registers with a unique "type" string, that is used to
> > dispatch messages coming from user space.  A new RDMA_NLDEV_ATTR
> > is defined for the "type" string.  User mode will pass 3 attributes
> > in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired rdma
> > device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type"
> > of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the net_device
> > interface to use for this link.  The DELLINK message will contain the
> > RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete.
> >
> > Signed-off-by: Steve Wise <swise@opengridcomputing.com>
> > Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
> >  drivers/infiniband/core/nldev.c  | 134 +++++++++++++++++++++++++++++++++++++++
> >  include/rdma/ib_verbs.h          |   2 +
> >  include/rdma/rdma_netlink.h      |  13 ++++
> >  include/uapi/rdma/rdma_netlink.h |  11 +++-
> >  4 files changed, 158 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
> > index 9abbadb9e366..b6d97c592074 100644
> > +++ b/drivers/infiniband/core/nldev.c
> > @@ -33,6 +33,7 @@
> >  #include <linux/module.h>
> >  #include <linux/pid.h>
> >  #include <linux/pid_namespace.h>
> > +#include <linux/mutex.h>
> >  #include <net/netlink.h>
> >  #include <rdma/rdma_cm.h>
> >  #include <rdma/rdma_netlink.h>
> > @@ -107,6 +108,8 @@
> >  	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
> >  	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
> >  	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
> > +	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type = NLA_NUL_STRING,
> > +				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
> >  };
> >  
> >  static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
> > @@ -1104,6 +1107,129 @@ static int nldev_res_get_pd_dumpit(struct sk_buff *skb,
> >  	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
> >  }
> >  
> > +static LIST_HEAD(link_ops);
> > +static DECLARE_RWSEM(link_ops_rwsem);
> > +
> > +static const struct rdma_link_ops *link_ops_get(const char *type)
> > +{
> > +	const struct rdma_link_ops *ops;
> > +
> > +	list_for_each_entry(ops, &link_ops, list) {
> > +		if (!strcmp(ops->type, type))
> > +			goto out;
> > +	}
> > +	ops = NULL;
> > +out:
> > +	return ops;
> > +}
> > +
> > +void rdma_link_register(struct rdma_link_ops *ops)
> > +{
> > +	down_write(&link_ops_rwsem);
> > +	if (link_ops_get(ops->type)) {
> > +		WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type);
> > +		goto out;
> > +	}
> > +	list_add(&ops->list, &link_ops);
> > +out:
> > +	up_write(&link_ops_rwsem);
> > +}
> > +EXPORT_SYMBOL(rdma_link_register);
> > +
> > +void rdma_link_unregister(struct rdma_link_ops *ops)
> > +{
> > +	down_write(&link_ops_rwsem);
> > +	list_del(&ops->list);
> > +	up_write(&link_ops_rwsem);
> > +}
> > +EXPORT_SYMBOL(rdma_link_unregister);
> > +
> > +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
> > +			  struct netlink_ext_ack *extack)
> > +{
> > +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
> > +	char ibdev_name[IB_DEVICE_NAME_MAX];
> > +	const struct rdma_link_ops *ops;
> > +	struct ib_device *device;
> > +	char ndev_name[IFNAMSIZ];
> > +	char type[IFNAMSIZ];
> > +	int err;
> > +
> > +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
> > +			  nldev_policy, extack);
> > +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
> > +	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
> > +		return -EINVAL;
> > +
> > +	nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
> > +		    sizeof(ibdev_name));
> > +	if (strchr(ibdev_name, '%'))
> > +		return -EINVAL;
> > +
> > +	nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
> > +	nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
> > +		    sizeof(ndev_name));
> > +
> > +	down_read(&link_ops_rwsem);
> > +	ops = link_ops_get(type);
> > +#ifdef CONFIG_MODULES
> > +	if (!ops) {
> > +		up_read(&link_ops_rwsem);
> > +		request_module("rdma-link-%s", type);
> > +		down_read(&link_ops_rwsem);
> > +		ops = link_ops_get(type);
> > +	}
> > +#endif
> > +	if (ops) {
> > +		device = ops->newlink(ibdev_name, ndev_name);
> > +		if (IS_ERR(device))
> > +			err = PTR_ERR(device);
> > +		else
> > +			device->link_ops = ops;
> > +	} else {
> > +		err = -ENODEV;
> > +	}
> > +	up_read(&link_ops_rwsem);
> > +
> > +	return err;
> > +}
> > +
> > +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
> > +			  struct netlink_ext_ack *extack)
> > +{
> > +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
> > +	const struct rdma_link_ops *ops;
> > +	struct device *dma_device;
> > +	struct ib_device *device;
> > +	u32 index;
> > +	int err;
> > +
> > +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
> > +			  nldev_policy, extack);
> > +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
> > +		return -EINVAL;
> > +
> > +	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
> > +	device = ib_device_get_by_index(index);
> > +	if (!device)
> > +		return -EINVAL;
> > +
> > +	ops = device->link_ops;
> > +
> > +	/*
> > +	 * Deref the ib_device before deleting it.  Otherwise we
> > +	 * deadlock unregistering the device.  Hold a ref on the
> > +	 * underlying dma_device though to keep the memory around
> > +	 * until we're done.
> > +	 */
> > +	dma_device = get_device(device->dma_device);
> > +	ib_device_put(device);
> > +	err = ops ? ops->dellink(device) : -ENODEV;
> > +	put_device(dma_device);

What is the weirndness with dma_device? It should just be device

> There is still a problem with the above code.  If thread 1 is running
> this code and gets up to just after ib_device_put() but before calling
> ops->dellink, and then thread2 runs the rmmod code of rxe, thread 2 can
> remove the device and complete the module unload.

Yes.. You can't use a lot of  the ib device outside the
ib_device_get_by_index/ib_device_put critical region.

The driver, or perhaps the core code, has to be the one to unput it,
and the unput has to be done under proper locking to resolve the race.

Jason
Steve Wise Dec. 6, 2018, 8:42 p.m. UTC | #6
On 12/6/2018 1:26 PM, Jason Gunthorpe wrote:
> On Thu, Dec 06, 2018 at 09:44:11AM -0600, Steve Wise wrote:
>> Commenting on my own submission:
>>
>> On 12/5/2018 9:14 AM, Steve Wise wrote:
>>> Add support for new LINK messages to allow adding and deleting rdma
>>> interfaces.  This will be used initially for soft rdma drivers which
>>> instantiate device instances dynamically by the admin specifying a
>>> netdev device to use.  The rdma_rxe module will be the first user of
>>> these messages.
>>>
>>> The design is modeled after RTNL_NEWLINK/DELLINK:  rdma drivers
>>> register with the rdma core if they provide link add/delete functions.
>>> Each driver registers with a unique "type" string, that is used to
>>> dispatch messages coming from user space.  A new RDMA_NLDEV_ATTR
>>> is defined for the "type" string.  User mode will pass 3 attributes
>>> in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired rdma
>>> device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type"
>>> of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the net_device
>>> interface to use for this link.  The DELLINK message will contain the
>>> RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete.
>>>
>>> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>>> Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
>>>  drivers/infiniband/core/nldev.c  | 134 +++++++++++++++++++++++++++++++++++++++
>>>  include/rdma/ib_verbs.h          |   2 +
>>>  include/rdma/rdma_netlink.h      |  13 ++++
>>>  include/uapi/rdma/rdma_netlink.h |  11 +++-
>>>  4 files changed, 158 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
>>> index 9abbadb9e366..b6d97c592074 100644
>>> +++ b/drivers/infiniband/core/nldev.c
>>> @@ -33,6 +33,7 @@
>>>  #include <linux/module.h>
>>>  #include <linux/pid.h>
>>>  #include <linux/pid_namespace.h>
>>> +#include <linux/mutex.h>
>>>  #include <net/netlink.h>
>>>  #include <rdma/rdma_cm.h>
>>>  #include <rdma/rdma_netlink.h>
>>> @@ -107,6 +108,8 @@
>>>  	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
>>>  	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
>>>  	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
>>> +	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type = NLA_NUL_STRING,
>>> +				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
>>>  };
>>>  
>>>  static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
>>> @@ -1104,6 +1107,129 @@ static int nldev_res_get_pd_dumpit(struct sk_buff *skb,
>>>  	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
>>>  }
>>>  
>>> +static LIST_HEAD(link_ops);
>>> +static DECLARE_RWSEM(link_ops_rwsem);
>>> +
>>> +static const struct rdma_link_ops *link_ops_get(const char *type)
>>> +{
>>> +	const struct rdma_link_ops *ops;
>>> +
>>> +	list_for_each_entry(ops, &link_ops, list) {
>>> +		if (!strcmp(ops->type, type))
>>> +			goto out;
>>> +	}
>>> +	ops = NULL;
>>> +out:
>>> +	return ops;
>>> +}
>>> +
>>> +void rdma_link_register(struct rdma_link_ops *ops)
>>> +{
>>> +	down_write(&link_ops_rwsem);
>>> +	if (link_ops_get(ops->type)) {
>>> +		WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type);
>>> +		goto out;
>>> +	}
>>> +	list_add(&ops->list, &link_ops);
>>> +out:
>>> +	up_write(&link_ops_rwsem);
>>> +}
>>> +EXPORT_SYMBOL(rdma_link_register);
>>> +
>>> +void rdma_link_unregister(struct rdma_link_ops *ops)
>>> +{
>>> +	down_write(&link_ops_rwsem);
>>> +	list_del(&ops->list);
>>> +	up_write(&link_ops_rwsem);
>>> +}
>>> +EXPORT_SYMBOL(rdma_link_unregister);
>>> +
>>> +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
>>> +			  struct netlink_ext_ack *extack)
>>> +{
>>> +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
>>> +	char ibdev_name[IB_DEVICE_NAME_MAX];
>>> +	const struct rdma_link_ops *ops;
>>> +	struct ib_device *device;
>>> +	char ndev_name[IFNAMSIZ];
>>> +	char type[IFNAMSIZ];
>>> +	int err;
>>> +
>>> +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
>>> +			  nldev_policy, extack);
>>> +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
>>> +	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
>>> +		return -EINVAL;
>>> +
>>> +	nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
>>> +		    sizeof(ibdev_name));
>>> +	if (strchr(ibdev_name, '%'))
>>> +		return -EINVAL;
>>> +
>>> +	nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
>>> +	nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
>>> +		    sizeof(ndev_name));
>>> +
>>> +	down_read(&link_ops_rwsem);
>>> +	ops = link_ops_get(type);
>>> +#ifdef CONFIG_MODULES
>>> +	if (!ops) {
>>> +		up_read(&link_ops_rwsem);
>>> +		request_module("rdma-link-%s", type);
>>> +		down_read(&link_ops_rwsem);
>>> +		ops = link_ops_get(type);
>>> +	}
>>> +#endif
>>> +	if (ops) {
>>> +		device = ops->newlink(ibdev_name, ndev_name);
>>> +		if (IS_ERR(device))
>>> +			err = PTR_ERR(device);
>>> +		else
>>> +			device->link_ops = ops;
>>> +	} else {
>>> +		err = -ENODEV;
>>> +	}
>>> +	up_read(&link_ops_rwsem);
>>> +
>>> +	return err;
>>> +}
>>> +
>>> +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
>>> +			  struct netlink_ext_ack *extack)
>>> +{
>>> +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
>>> +	const struct rdma_link_ops *ops;
>>> +	struct device *dma_device;
>>> +	struct ib_device *device;
>>> +	u32 index;
>>> +	int err;
>>> +
>>> +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
>>> +			  nldev_policy, extack);
>>> +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
>>> +		return -EINVAL;
>>> +
>>> +	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
>>> +	device = ib_device_get_by_index(index);
>>> +	if (!device)
>>> +		return -EINVAL;
>>> +
>>> +	ops = device->link_ops;
>>> +
>>> +	/*
>>> +	 * Deref the ib_device before deleting it.  Otherwise we
>>> +	 * deadlock unregistering the device.  Hold a ref on the
>>> +	 * underlying dma_device though to keep the memory around
>>> +	 * until we're done.
>>> +	 */
>>> +	dma_device = get_device(device->dma_device);
>>> +	ib_device_put(device);
>>> +	err = ops ? ops->dellink(device) : -ENODEV;
>>> +	put_device(dma_device);
> What is the weirndness with dma_device? It should just be device

Do you mean ibdev->dev?


>> There is still a problem with the above code.  If thread 1 is running
>> this code and gets up to just after ib_device_put() but before calling
>> ops->dellink, and then thread2 runs the rmmod code of rxe, thread 2 can
>> remove the device and complete the module unload.
> Yes.. You can't use a lot of  the ib device outside the
> ib_device_get_by_index/ib_device_put critical region.
>
> The driver, or perhaps the core code, has to be the one to unput it,
> and the unput has to be done under proper locking to resolve the race.

We could define that the link_ops dellink function must do the final
ib_device_put().  That would work...
Steve Wise Dec. 6, 2018, 9:16 p.m. UTC | #7
On 12/6/2018 2:42 PM, Steve Wise wrote:
> On 12/6/2018 1:26 PM, Jason Gunthorpe wrote:
>> On Thu, Dec 06, 2018 at 09:44:11AM -0600, Steve Wise wrote:
>>> Commenting on my own submission:
>>>
>>> On 12/5/2018 9:14 AM, Steve Wise wrote:
>>>> Add support for new LINK messages to allow adding and deleting rdma
>>>> interfaces.  This will be used initially for soft rdma drivers which
>>>> instantiate device instances dynamically by the admin specifying a
>>>> netdev device to use.  The rdma_rxe module will be the first user of
>>>> these messages.
>>>>
>>>> The design is modeled after RTNL_NEWLINK/DELLINK:  rdma drivers
>>>> register with the rdma core if they provide link add/delete functions.
>>>> Each driver registers with a unique "type" string, that is used to
>>>> dispatch messages coming from user space.  A new RDMA_NLDEV_ATTR
>>>> is defined for the "type" string.  User mode will pass 3 attributes
>>>> in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired rdma
>>>> device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type"
>>>> of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the net_device
>>>> interface to use for this link.  The DELLINK message will contain the
>>>> RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete.
>>>>
>>>> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>>>> Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
>>>>  drivers/infiniband/core/nldev.c  | 134 +++++++++++++++++++++++++++++++++++++++
>>>>  include/rdma/ib_verbs.h          |   2 +
>>>>  include/rdma/rdma_netlink.h      |  13 ++++
>>>>  include/uapi/rdma/rdma_netlink.h |  11 +++-
>>>>  4 files changed, 158 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
>>>> index 9abbadb9e366..b6d97c592074 100644
>>>> +++ b/drivers/infiniband/core/nldev.c
>>>> @@ -33,6 +33,7 @@
>>>>  #include <linux/module.h>
>>>>  #include <linux/pid.h>
>>>>  #include <linux/pid_namespace.h>
>>>> +#include <linux/mutex.h>
>>>>  #include <net/netlink.h>
>>>>  #include <rdma/rdma_cm.h>
>>>>  #include <rdma/rdma_netlink.h>
>>>> @@ -107,6 +108,8 @@
>>>>  	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
>>>>  	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
>>>>  	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
>>>> +	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type = NLA_NUL_STRING,
>>>> +				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
>>>>  };
>>>>  
>>>>  static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
>>>> @@ -1104,6 +1107,129 @@ static int nldev_res_get_pd_dumpit(struct sk_buff *skb,
>>>>  	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
>>>>  }
>>>>  
>>>> +static LIST_HEAD(link_ops);
>>>> +static DECLARE_RWSEM(link_ops_rwsem);
>>>> +
>>>> +static const struct rdma_link_ops *link_ops_get(const char *type)
>>>> +{
>>>> +	const struct rdma_link_ops *ops;
>>>> +
>>>> +	list_for_each_entry(ops, &link_ops, list) {
>>>> +		if (!strcmp(ops->type, type))
>>>> +			goto out;
>>>> +	}
>>>> +	ops = NULL;
>>>> +out:
>>>> +	return ops;
>>>> +}
>>>> +
>>>> +void rdma_link_register(struct rdma_link_ops *ops)
>>>> +{
>>>> +	down_write(&link_ops_rwsem);
>>>> +	if (link_ops_get(ops->type)) {
>>>> +		WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type);
>>>> +		goto out;
>>>> +	}
>>>> +	list_add(&ops->list, &link_ops);
>>>> +out:
>>>> +	up_write(&link_ops_rwsem);
>>>> +}
>>>> +EXPORT_SYMBOL(rdma_link_register);
>>>> +
>>>> +void rdma_link_unregister(struct rdma_link_ops *ops)
>>>> +{
>>>> +	down_write(&link_ops_rwsem);
>>>> +	list_del(&ops->list);
>>>> +	up_write(&link_ops_rwsem);
>>>> +}
>>>> +EXPORT_SYMBOL(rdma_link_unregister);
>>>> +
>>>> +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
>>>> +			  struct netlink_ext_ack *extack)
>>>> +{
>>>> +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
>>>> +	char ibdev_name[IB_DEVICE_NAME_MAX];
>>>> +	const struct rdma_link_ops *ops;
>>>> +	struct ib_device *device;
>>>> +	char ndev_name[IFNAMSIZ];
>>>> +	char type[IFNAMSIZ];
>>>> +	int err;
>>>> +
>>>> +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
>>>> +			  nldev_policy, extack);
>>>> +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
>>>> +	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
>>>> +		return -EINVAL;
>>>> +
>>>> +	nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
>>>> +		    sizeof(ibdev_name));
>>>> +	if (strchr(ibdev_name, '%'))
>>>> +		return -EINVAL;
>>>> +
>>>> +	nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
>>>> +	nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
>>>> +		    sizeof(ndev_name));
>>>> +
>>>> +	down_read(&link_ops_rwsem);
>>>> +	ops = link_ops_get(type);
>>>> +#ifdef CONFIG_MODULES
>>>> +	if (!ops) {
>>>> +		up_read(&link_ops_rwsem);
>>>> +		request_module("rdma-link-%s", type);
>>>> +		down_read(&link_ops_rwsem);
>>>> +		ops = link_ops_get(type);
>>>> +	}
>>>> +#endif
>>>> +	if (ops) {
>>>> +		device = ops->newlink(ibdev_name, ndev_name);
>>>> +		if (IS_ERR(device))
>>>> +			err = PTR_ERR(device);
>>>> +		else
>>>> +			device->link_ops = ops;
>>>> +	} else {
>>>> +		err = -ENODEV;
>>>> +	}
>>>> +	up_read(&link_ops_rwsem);
>>>> +
>>>> +	return err;
>>>> +}
>>>> +
>>>> +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
>>>> +			  struct netlink_ext_ack *extack)
>>>> +{
>>>> +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
>>>> +	const struct rdma_link_ops *ops;
>>>> +	struct device *dma_device;
>>>> +	struct ib_device *device;
>>>> +	u32 index;
>>>> +	int err;
>>>> +
>>>> +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
>>>> +			  nldev_policy, extack);
>>>> +	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
>>>> +		return -EINVAL;
>>>> +
>>>> +	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
>>>> +	device = ib_device_get_by_index(index);
>>>> +	if (!device)
>>>> +		return -EINVAL;
>>>> +
>>>> +	ops = device->link_ops;
>>>> +
>>>> +	/*
>>>> +	 * Deref the ib_device before deleting it.  Otherwise we
>>>> +	 * deadlock unregistering the device.  Hold a ref on the
>>>> +	 * underlying dma_device though to keep the memory around
>>>> +	 * until we're done.
>>>> +	 */
>>>> +	dma_device = get_device(device->dma_device);
>>>> +	ib_device_put(device);
>>>> +	err = ops ? ops->dellink(device) : -ENODEV;
>>>> +	put_device(dma_device);
>> What is the weirndness with dma_device? It should just be device
> Do you mean ibdev->dev?
>
>
>>> There is still a problem with the above code.  If thread 1 is running
>>> this code and gets up to just after ib_device_put() but before calling
>>> ops->dellink, and then thread2 runs the rmmod code of rxe, thread 2 can
>>> remove the device and complete the module unload.
>> Yes.. You can't use a lot of  the ib device outside the
>> ib_device_get_by_index/ib_device_put critical region.
>>
>> The driver, or perhaps the core code, has to be the one to unput it,
>> and the unput has to be done under proper locking to resolve the race.
> We could define that the link_ops dellink function must do the final
> ib_device_put().  That would work...

This requires exporting ib_device_put() for drivers to use. 

Jason, This  feature took a step back, imo, by using
ib_device_get_by_index() in nldev_dellink() and passing the struct
ib_device ptr vs simply passing the ibdev name string to the driver's
dellink() function and letting the driver deal with it all.  I'm
inclined to go back to that approach.  I don't see things getting
cleaner and simpler on this current path. 

Do you?
Jason Gunthorpe Dec. 6, 2018, 10:21 p.m. UTC | #8
On Thu, Dec 06, 2018 at 03:16:57PM -0600, Steve Wise wrote:
> Jason, This  feature took a step back, imo, by using
> ib_device_get_by_index() in nldev_dellink() and passing the struct
> ib_device ptr vs simply passing the ibdev name string to the driver's
> dellink() function and letting the driver deal with it all.  I'm
> inclined to go back to that approach.  I don't see things getting
> cleaner and simpler on this current path. 

It doesn't solve anything to have the driver do the lookup, it still
has to have proper locking and prevent the races. If it can do that,
then it can do it with an already looked-up device just the same.

I'm still wondering if there is some core functions we could create to
handle this common case for dellink drivers.

Jason
Steve Wise Dec. 6, 2018, 10:58 p.m. UTC | #9
On 12/6/2018 4:21 PM, Jason Gunthorpe wrote:
> On Thu, Dec 06, 2018 at 03:16:57PM -0600, Steve Wise wrote:
>> Jason, This  feature took a step back, imo, by using
>> ib_device_get_by_index() in nldev_dellink() and passing the struct
>> ib_device ptr vs simply passing the ibdev name string to the driver's
>> dellink() function and letting the driver deal with it all.  I'm
>> inclined to go back to that approach.  I don't see things getting
>> cleaner and simpler on this current path. 
> It doesn't solve anything to have the driver do the lookup, it still
> has to have proper locking and prevent the races. If it can do that,
> then it can do it with an already looked-up device just the same.

It is easier by just passing the name to the driver because the driver
can easily determine if the device has been removed via a
driver-specific lock and lookup.  Looking up the device via
ib_device_get_by_index() uses a core-private mechanism that isn't public
to the driver, so it makes life more difficult.  I could export
ib_device_put() but that doesn't seem clean to me.

> I'm still wondering if there is some core functions we could create to
> handle this common case for dellink drivers.

Please noodle on it - I could use your help.  Thanks...
Jason Gunthorpe Dec. 6, 2018, 11:45 p.m. UTC | #10
On Thu, Dec 06, 2018 at 04:58:16PM -0600, Steve Wise wrote:
> 
> On 12/6/2018 4:21 PM, Jason Gunthorpe wrote:
> > On Thu, Dec 06, 2018 at 03:16:57PM -0600, Steve Wise wrote:
> >> Jason, This  feature took a step back, imo, by using
> >> ib_device_get_by_index() in nldev_dellink() and passing the struct
> >> ib_device ptr vs simply passing the ibdev name string to the driver's
> >> dellink() function and letting the driver deal with it all.  I'm
> >> inclined to go back to that approach.  I don't see things getting
> >> cleaner and simpler on this current path. 
> > It doesn't solve anything to have the driver do the lookup, it still
> > has to have proper locking and prevent the races. If it can do that,
> > then it can do it with an already looked-up device just the same.
> 
> It is easier by just passing the name to the driver because the driver
> can easily determine if the device has been removed via a
> driver-specific lock and lookup.  Looking up the device via
> ib_device_get_by_index() uses a core-private mechanism that isn't public
> to the driver, so it makes life more difficult.  I could export
> ib_device_put() but that doesn't seem clean to me.
> 
> > I'm still wondering if there is some core functions we could create to
> > handle this common case for dellink drivers.
> 
> Please noodle on it - I could use your help.  Thanks...

Maybe the simplest is to have dellink return 0 and then the core code
directly does the unregister. Then the core code can handle the
special locking with internal locks.

To make this work we probably need to add a 'close' callback into
ib_device so that the core can sequence destruction properly - ie
everything that happens in the driver after unregister should be done
in close.

This is sort of what netdev does.

Jason
Steve Wise Dec. 7, 2018, 2:23 p.m. UTC | #11
> On Thu, Dec 06, 2018 at 03:16:57PM -0600, Steve Wise wrote:
> > Jason, This  feature took a step back, imo, by using
> > ib_device_get_by_index() in nldev_dellink() and passing the struct
> > ib_device ptr vs simply passing the ibdev name string to the driver's
> > dellink() function and letting the driver deal with it all.  I'm
> > inclined to go back to that approach.  I don't see things getting
> > cleaner and simpler on this current path.
> 
> It doesn't solve anything to have the driver do the lookup, it still
> has to have proper locking and prevent the races. If it can do that,
> then it can do it with an already looked-up device just the same.
>

It can solve it because the driver has its own list of "active devices" and can use that to atomically remove it and deregister the ib device when everyone is done touching the driver struct.   

Using ib_device_get_by_index() just causes more headaches in the dellink path IMO...
 
> I'm still wondering if there is some core functions we could create to
> handle this common case for dellink drivers.
> 
> Jason
Steve Wise Dec. 7, 2018, 2:23 p.m. UTC | #12
> > >> Jason, This  feature took a step back, imo, by using
> > >> ib_device_get_by_index() in nldev_dellink() and passing the struct
> > >> ib_device ptr vs simply passing the ibdev name string to the driver's
> > >> dellink() function and letting the driver deal with it all.  I'm
> > >> inclined to go back to that approach.  I don't see things getting
> > >> cleaner and simpler on this current path.
> > > It doesn't solve anything to have the driver do the lookup, it still
> > > has to have proper locking and prevent the races. If it can do that,
> > > then it can do it with an already looked-up device just the same.
> >
> > It is easier by just passing the name to the driver because the driver
> > can easily determine if the device has been removed via a
> > driver-specific lock and lookup.  Looking up the device via
> > ib_device_get_by_index() uses a core-private mechanism that isn't public
> > to the driver, so it makes life more difficult.  I could export
> > ib_device_put() but that doesn't seem clean to me.
> >
> > > I'm still wondering if there is some core functions we could create to
> > > handle this common case for dellink drivers.
> >
> > Please noodle on it - I could use your help.  Thanks...
> 
> Maybe the simplest is to have dellink return 0 and then the core code
> directly does the unregister. Then the core code can handle the
> special locking with internal locks.
> 
> To make this work we probably need to add a 'close' callback into
> ib_device so that the core can sequence destruction properly - ie
> everything that happens in the driver after unregister should be done
> in close.
> 
> This is sort of what netdev does.

This approach results in ib device unregister done by the driver for cases like NETDEV_UNREGISTER and module unload, and ib_device unregister done by the core for just DELLINK.  I don't like that.

Also, the RXE unregister logic would become more complicated because the "last man out must call ib_device_unregister()" is now dependent on which path is doing this final device remove...
Steve Wise Dec. 7, 2018, 4:12 p.m. UTC | #13
> 
> > > >> Jason, This  feature took a step back, imo, by using
> > > >> ib_device_get_by_index() in nldev_dellink() and passing the struct
> > > >> ib_device ptr vs simply passing the ibdev name string to the driver's
> > > >> dellink() function and letting the driver deal with it all.  I'm
> > > >> inclined to go back to that approach.  I don't see things getting
> > > >> cleaner and simpler on this current path.
> > > > It doesn't solve anything to have the driver do the lookup, it still
> > > > has to have proper locking and prevent the races. If it can do that,
> > > > then it can do it with an already looked-up device just the same.
> > >
> > > It is easier by just passing the name to the driver because the driver
> > > can easily determine if the device has been removed via a
> > > driver-specific lock and lookup.  Looking up the device via
> > > ib_device_get_by_index() uses a core-private mechanism that isn't public
> > > to the driver, so it makes life more difficult.  I could export
> > > ib_device_put() but that doesn't seem clean to me.
> > >
> > > > I'm still wondering if there is some core functions we could create to
> > > > handle this common case for dellink drivers.
> > >
> > > Please noodle on it - I could use your help.  Thanks...
> >
> > Maybe the simplest is to have dellink return 0 and then the core code
> > directly does the unregister. Then the core code can handle the
> > special locking with internal locks.
> >
> > To make this work we probably need to add a 'close' callback into
> > ib_device so that the core can sequence destruction properly - ie
> > everything that happens in the driver after unregister should be done
> > in close.
> >
> > This is sort of what netdev does.
> 
> This approach results in ib device unregister done by the driver for cases like
> NETDEV_UNREGISTER and module unload, and ib_device unregister done by
> the core for just DELLINK.  I don't like that.
> 
> Also, the RXE unregister logic would become more complicated because the
> "last man out must call ib_device_unregister()" is now dependent on which
> path is doing this final device remove...
> 
> 

Here is a way to ensure the module doesn't unload.  With this and my current rxe changes, I think the races are all handled.

...

        device = ib_device_get_by_index(index);
        if (!device)
                return -EINVAL;

        ops = device->link_ops;
        if (!ops) {
                ib_device_put(device);
                return -EINVAL;
        }

        /*
         * Deref the ib_device before deleting it.  Otherwise we
         * deadlock unregistering the device.  Hold a ref on the
         * underlying device to keep the memory around until we're
         * done.  If the module owning the device is unloading,
         * then the driver will delete the device during unload.
         */
        dev = get_device(&device->dev);
        owner = device->owner;
        removing = try_module_get(owner);
        ib_device_put(device);

        if (!removing) {
                ops->dellink(device);
                module_put(owner);
        }
        put_device(&device->dev);
        return err;

...
Jason Gunthorpe Dec. 7, 2018, 7:25 p.m. UTC | #14
On Fri, Dec 07, 2018 at 10:12:44AM -0600, Steve Wise wrote:

> Here is a way to ensure the module doesn't unload.  With this and my
> current rxe changes, I think the races are all handled.

I don't like messing with module refcount to solve a locking problem..

Here something like this compile-tested thing..

Instead of implementing dellink as a function call implement it as a
flag 'ALLOW_USER_UNREGISTER'. The core code simply calls
ib_unregister_device_and_put() upon dellink.

rxe was full of lifetime bugs in this area, I think this fixes alot of
them.. It probably replaces that other patch trying to do the same.

From 202ce0c783f0a2022ad49f5154be801b1d2a7b32 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 7 Dec 2018 10:30:52 -0700
Subject: [PATCH] verbs/rxe: Use core services to keep track of RXE things

Replace
 - The nonsense rxe_dev.kref with rxe_dev.ib_dev.dev.kref
 - rxe_dev_list with core's device_list.
 - Buggy net_to_rxe with ib_device_get_by_netdev that does proper
   lifetime management
 - Buggy get_rxe_by_name with ib_device_get_by_name

Add
 - ib_unregister_driver so drivers like rxe don't have to keep track
   of things to do module unload
 - ib_unregister_device_and_put to work with potiners returned by
   that various ib_device_get_* functions

The refcount change to the pool might need more work, but fundamentally
the driver cannot have krefs open in the pool across unregistration or it
contains module unlolad races.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/core_priv.h   |   1 -
 drivers/infiniband/core/device.c      | 210 ++++++++++++++++++++++++--
 drivers/infiniband/sw/rxe/rxe.c       |  38 +----
 drivers/infiniband/sw/rxe/rxe.h       |  14 +-
 drivers/infiniband/sw/rxe/rxe_loc.h   |   2 +-
 drivers/infiniband/sw/rxe/rxe_net.c   |  82 +++-------
 drivers/infiniband/sw/rxe/rxe_pool.c  |   4 -
 drivers/infiniband/sw/rxe/rxe_sysfs.c |  39 ++---
 drivers/infiniband/sw/rxe/rxe_verbs.c |   7 -
 drivers/infiniband/sw/rxe/rxe_verbs.h |   3 -
 include/rdma/ib_verbs.h               |  10 ++
 11 files changed, 257 insertions(+), 153 deletions(-)

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index cc7535c5e19233..1b2575430032d0 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -267,7 +267,6 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
 #endif
 
 struct ib_device *ib_device_get_by_index(u32 ifindex);
-void ib_device_put(struct ib_device *device);
 /* RDMA device netlink */
 void nldev_init(void);
 void nldev_exit(void);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 0027b0d79b09e5..ec60ea3429b024 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -85,6 +85,9 @@ static LIST_HEAD(client_list);
 static DEFINE_MUTEX(device_mutex);
 static DECLARE_RWSEM(lists_rwsem);
 
+/* Used to serialize ib_unregister_driver */
+static DECLARE_RWSEM(unregister_rwsem);
+
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
 			      void *lsm_data);
 static void ib_policy_change_task(struct work_struct *work);
@@ -168,6 +171,7 @@ void ib_device_put(struct ib_device *device)
 	if (refcount_dec_and_test(&device->refcount))
 		complete(&device->unreg_completion);
 }
+EXPORT_SYMBOL(ib_device_put);
 
 static struct ib_device *__ib_device_get_by_name(const char *name)
 {
@@ -180,6 +184,27 @@ static struct ib_device *__ib_device_get_by_name(const char *name)
 	return NULL;
 }
 
+struct ib_device *ib_device_get_by_name(const char *name,
+					enum rdma_driver_id driver_id)
+{
+	struct ib_device *device;
+
+	down_read(&lists_rwsem);
+	device = __ib_device_get_by_name(name);
+	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
+	    device->driver_id != driver_id)
+		device = NULL;
+
+	if (device) {
+		/* Do not return a device if unregistration has started. */
+		if (!refcount_inc_not_zero(&device->refcount))
+			device = NULL;
+	}
+	up_read(&lists_rwsem);
+	return device;
+}
+EXPORT_SYMBOL(ib_device_get_by_name);
+
 int ib_device_rename(struct ib_device *ibdev, const char *name)
 {
 	struct ib_device *device;
@@ -641,28 +666,41 @@ int ib_register_device(struct ib_device *device, const char *name,
 }
 EXPORT_SYMBOL(ib_register_device);
 
-/**
- * ib_unregister_device - Unregister an IB device
- * @device:Device to unregister
+/* Returns false if the device is already undergoing unregistration in another
+ * thread, and the device pointer may be invalid upon return.
  *
- * Unregister an IB device.  All clients will receive a remove callback.
+ * If true is returned then the caller owns a kref associated with the device
+ * (taken from the kref owned by the list).
+ *
+ * Upon return the device cannot be returned by any 'get' functions.
  */
-void ib_unregister_device(struct ib_device *device)
+static bool ib_pre_unregister(struct ib_device *device)
+{
+	bool already_removed;
+
+	mutex_lock(&device_mutex);
+	down_write(&lists_rwsem);
+	already_removed = list_empty(&device->core_list);
+	list_del_init(&device->core_list);
+	up_write(&lists_rwsem);
+	mutex_unlock(&device_mutex);
+
+	/* Matches the recound_set in ib_alloc_device */
+	ib_device_put(device);
+
+	return !already_removed;
+}
+
+static void __ib_unregister_device(struct ib_device *device)
 {
 	struct ib_client_data *context, *tmp;
 	unsigned long flags;
 
-	/*
-	 * Wait for all netlink command callers to finish working on the
-	 * device.
-	 */
-	ib_device_put(device);
 	wait_for_completion(&device->unreg_completion);
 
 	mutex_lock(&device_mutex);
 
 	down_write(&lists_rwsem);
-	list_del(&device->core_list);
 	write_lock_irq(&device->client_data_lock);
 	list_for_each_entry(context, &device->client_data_list, list)
 		context->going_down = true;
@@ -696,9 +734,115 @@ void ib_unregister_device(struct ib_device *device)
 	up_write(&lists_rwsem);
 
 	device->reg_state = IB_DEV_UNREGISTERED;
+
+	/*
+	 * Drivers using the new flow may not call ib_dealloc_device except
+	 * in error unwind prior to registration success.
+	 */
+	if (device->driver_unregister) {
+		device->driver_unregister(device);
+		ib_dealloc_device(device);
+	}
+}
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device: The device to unregister
+ *
+ * Unregister an IB device.  All clients will receive a remove callback.
+ *
+ * Callers can call this routine only once, and must protect against
+ * races. Typically it should only be called as part of a remove callback in
+ * an implementation of driver core's struct device_driver and related.
+ *
+ * A driver must choose to use either only ib_unregister_device or
+ * ib_unregister_device_and_put & ib_unregister_driver.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+	down_read(&unregister_rwsem);
+	if (ib_pre_unregister(device))
+		__ib_unregister_device(device);
+	else
+		WARN(true, "Driver mixing ib_unregister_device with _and_put");
+	up_read(&unregister_rwsem);
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
+/**
+ * ib_unregister_device_and_put - Unregister a device while holding a 'get'
+ * device: The device to unregister
+ *
+ * This is the same as ib_unregister_device(), except it includes an internal
+ * ib_device_put() that should match a 'get' obtained by the caller.
+ *
+ * It is safe to call this routine concurrently from multiple threads while
+ * holding the 'get', however in this case return from the function does not
+ * guarantee the device is destroyed, only that destruction is in-progress.
+ *
+ * Drivers using this flow MUST use the driver_unregister callback to clean up
+ * their resources associated with the device and dealloc it.
+ */
+void ib_unregister_device_and_put(struct ib_device *device)
+{
+	down_read(&unregister_rwsem);
+	if (ib_pre_unregister(device)) {
+		/*
+		 * Caller's get can be returned now that we hold the kref,
+		 * otherwise ib_pre_unregister returned the caller's get
+		 */
+		ib_device_put(device);
+		__ib_unregister_device(device);
+	}
+	up_read(&unregister_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_device_and_put);
+
+/**
+ * ib_unregister_driver - Unregister all IB devices for a driver
+ * @driver_id: The driver to unregister
+ *
+ * This implements a fence for device unregistration. It only returns once all
+ * devices associated with the driver_id have fully completed their
+ * unregistration and returned from ib_unregister_device*().
+ *
+ * If device's are not yet unregistered it goes ahead and starts unregistering
+ * them.
+ *
+ * The caller must ensure that no devices can be be added while this is
+ * running (or at all for the module_unload case)
+ */
+void ib_unregister_driver(enum rdma_driver_id driver_id)
+{
+	struct ib_device *device;
+	LIST_HEAD(driver_devs);
+
+	down_write(&unregister_rwsem);
+	mutex_lock(&device_mutex);
+	down_write(&lists_rwsem);
+	list_for_each_entry(device, &device_list, core_list) {
+		if (device->driver_id == driver_id)
+			list_move(&device->core_list, &driver_devs);
+	}
+	up_write(&lists_rwsem);
+
+	while ((device = list_first_entry_or_null(
+			&driver_devs, struct ib_device, core_list))) {
+		mutex_unlock(&device_mutex);
+		/*
+		 * We are holding the unregister_rwsem, so it is impossible
+		 * for another thread to be doing registration.
+		 */
+		if (!ib_pre_unregister(device))
+			WARN(true, "Impossible failure of ib_pre_unregister");
+		__ib_unregister_device(device);
+		mutex_lock(&device_mutex);
+	}
+	mutex_unlock(&device_mutex);
+	up_write(&unregister_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_driver);
+
 /**
  * ib_register_client - Register an IB client
  * @client:Client to register
@@ -981,6 +1125,50 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
 		}
 }
 
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+					  enum rdma_driver_id driver_id)
+{
+	struct ib_device *ib_dev;
+
+	down_read(&lists_rwsem);
+	list_for_each_entry(ib_dev, &device_list, core_list) {
+		unsigned int port;
+
+		if (driver_id != RDMA_DRIVER_UNKNOWN &&
+		    ib_dev->driver_id != driver_id)
+			continue;
+
+		if (!ib_dev->get_netdev)
+			continue;
+
+		for (port = rdma_start_port(ib_dev);
+		     port <= rdma_end_port(ib_dev); port++) {
+			struct net_device *idev;
+
+			if (!rdma_protocol_roce(ib_dev, port))
+				continue;
+
+			idev = ib_dev->get_netdev(ib_dev, port);
+			if (idev != ndev)
+				continue;
+
+			/*
+			 * Do not return a device if unregistration has
+			 * started.
+			 */
+			if (!refcount_inc_not_zero(&ib_dev->refcount))
+				continue;
+
+			up_read(&lists_rwsem);
+			return ib_dev;
+		}
+	}
+	up_read(&lists_rwsem);
+
+	return NULL;
+}
+EXPORT_SYMBOL(ib_device_get_by_netdev);
+
 /**
  * ib_enum_all_roce_netdevs - enumerate all RoCE devices
  * @filter: Should we call the callback?
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 383e65c7bbc057..4069463f9a6641 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -50,8 +50,10 @@ static void rxe_cleanup_ports(struct rxe_dev *rxe)
 /* free resources for a rxe device all objects created for this device must
  * have been destroyed
  */
-static void rxe_cleanup(struct rxe_dev *rxe)
+void rxe_unregistered(struct ib_device *ib_dev)
 {
+	struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
+
 	rxe_pool_cleanup(&rxe->uc_pool);
 	rxe_pool_cleanup(&rxe->pd_pool);
 	rxe_pool_cleanup(&rxe->ah_pool);
@@ -68,15 +70,6 @@ static void rxe_cleanup(struct rxe_dev *rxe)
 	crypto_free_shash(rxe->tfm);
 }
 
-/* called when all references have been dropped */
-void rxe_release(struct kref *kref)
-{
-	struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt);
-
-	rxe_cleanup(rxe);
-	ib_dealloc_device(&rxe->ib_dev);
-}
-
 /* initialize rxe device parameters */
 static void rxe_init_device_param(struct rxe_dev *rxe)
 {
@@ -279,7 +272,6 @@ static int rxe_init(struct rxe_dev *rxe)
 	spin_lock_init(&rxe->mmap_offset_lock);
 	spin_lock_init(&rxe->pending_lock);
 	INIT_LIST_HEAD(&rxe->pending_mmaps);
-	INIT_LIST_HEAD(&rxe->list);
 
 	mutex_init(&rxe->usdev_lock);
 
@@ -312,31 +304,13 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
 {
 	int err;
 
-	kref_init(&rxe->ref_cnt);
-
 	err = rxe_init(rxe);
 	if (err)
-		goto err1;
+		return err;
 
 	rxe_set_mtu(rxe, mtu);
 
-	err = rxe_register_device(rxe);
-	if (err)
-		goto err1;
-
-	return 0;
-
-err1:
-	rxe_dev_put(rxe);
-	return err;
-}
-
-/* called by the ifc layer to remove a device */
-void rxe_remove(struct rxe_dev *rxe)
-{
-	rxe_unregister_device(rxe);
-
-	rxe_dev_put(rxe);
+	return rxe_register_device(rxe);
 }
 
 static int __init rxe_module_init(void)
@@ -360,7 +334,7 @@ static int __init rxe_module_init(void)
 
 static void __exit rxe_module_exit(void)
 {
-	rxe_remove_all();
+	ib_unregister_driver(RDMA_DRIVER_RXE);
 	rxe_net_exit();
 	rxe_cache_exit();
 
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
index 8f79bd86d0337f..75e7113a756d22 100644
--- a/drivers/infiniband/sw/rxe/rxe.h
+++ b/drivers/infiniband/sw/rxe/rxe.h
@@ -96,17 +96,19 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe,
 void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
 
 int rxe_add(struct rxe_dev *rxe, unsigned int mtu);
-void rxe_remove(struct rxe_dev *rxe);
-void rxe_remove_all(void);
 
 void rxe_rcv(struct sk_buff *skb);
 
-static inline void rxe_dev_put(struct rxe_dev *rxe)
+/* The caller must do a matching ib_device_put(&dev->ib_dev) */
+static inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev)
 {
-	kref_put(&rxe->ref_cnt, rxe_release);
+	struct ib_device *ibdev =
+		ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE);
+
+	if (!ibdev)
+		return NULL;
+	return container_of(ibdev, struct rxe_dev, ib_dev);
 }
-struct rxe_dev *net_to_rxe(struct net_device *ndev);
-struct rxe_dev *get_rxe_by_name(const char *name);
 
 void rxe_port_up(struct rxe_dev *rxe);
 void rxe_port_down(struct rxe_dev *rxe);
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index a675c9f2b42783..e830d56a15c2a4 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -231,7 +231,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
 		      struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
 		      struct rxe_modify_srq_cmd *ucmd);
 
-void rxe_release(struct kref *kref);
+void rxe_unregistered(struct ib_device *ib_dev);
 
 int rxe_completer(void *arg);
 int rxe_requester(void *arg);
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index b26a8141f3edcb..d1f5959d9013a5 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -45,43 +45,6 @@
 #include "rxe_net.h"
 #include "rxe_loc.h"
 
-static LIST_HEAD(rxe_dev_list);
-static DEFINE_SPINLOCK(dev_list_lock); /* spinlock for device list */
-
-struct rxe_dev *net_to_rxe(struct net_device *ndev)
-{
-	struct rxe_dev *rxe;
-	struct rxe_dev *found = NULL;
-
-	spin_lock_bh(&dev_list_lock);
-	list_for_each_entry(rxe, &rxe_dev_list, list) {
-		if (rxe->ndev == ndev) {
-			found = rxe;
-			break;
-		}
-	}
-	spin_unlock_bh(&dev_list_lock);
-
-	return found;
-}
-
-struct rxe_dev *get_rxe_by_name(const char *name)
-{
-	struct rxe_dev *rxe;
-	struct rxe_dev *found = NULL;
-
-	spin_lock_bh(&dev_list_lock);
-	list_for_each_entry(rxe, &rxe_dev_list, list) {
-		if (!strcmp(name, dev_name(&rxe->ib_dev.dev))) {
-			found = rxe;
-			break;
-		}
-	}
-	spin_unlock_bh(&dev_list_lock);
-	return found;
-}
-
-
 static struct rxe_recv_sockets recv_sockets;
 
 struct device *rxe_dma_device(struct rxe_dev *rxe)
@@ -229,12 +192,12 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	struct udphdr *udph;
 	struct net_device *ndev = skb->dev;
 	struct net_device *rdev = ndev;
-	struct rxe_dev *rxe = net_to_rxe(ndev);
+	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
 	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
 
 	if (!rxe && is_vlan_dev(rdev)) {
 		rdev = vlan_dev_real_dev(ndev);
-		rxe = net_to_rxe(rdev);
+		rxe = rxe_get_dev_from_net(rdev);
 	}
 	if (!rxe)
 		goto drop;
@@ -253,6 +216,12 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 
 	rxe_rcv(skb);
 
+	/*
+	 * FIXME: this is in the wrong place, it needs to be done when pkt is
+	 * destroyed
+	 */
+	ib_device_put(&rxe->ib_dev);
+
 	return 0;
 drop:
 	kfree_skb(skb);
@@ -558,36 +527,18 @@ struct rxe_dev *rxe_net_add(struct net_device *ndev)
 	rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
 	if (!rxe)
 		return NULL;
-
+	rxe->ib_dev.driver_unregister = rxe_unregistered;
 	rxe->ndev = ndev;
 
 	err = rxe_add(rxe, ndev->mtu);
 	if (err) {
-		ib_dealloc_device(&rxe->ib_dev);
+		rxe_unregistered(&rxe->ib_dev);
 		return NULL;
 	}
 
-	spin_lock_bh(&dev_list_lock);
-	list_add_tail(&rxe->list, &rxe_dev_list);
-	spin_unlock_bh(&dev_list_lock);
 	return rxe;
 }
 
-void rxe_remove_all(void)
-{
-	spin_lock_bh(&dev_list_lock);
-	while (!list_empty(&rxe_dev_list)) {
-		struct rxe_dev *rxe =
-			list_first_entry(&rxe_dev_list, struct rxe_dev, list);
-
-		list_del(&rxe->list);
-		spin_unlock_bh(&dev_list_lock);
-		rxe_remove(rxe);
-		spin_lock_bh(&dev_list_lock);
-	}
-	spin_unlock_bh(&dev_list_lock);
-}
-
 static void rxe_port_event(struct rxe_dev *rxe,
 			   enum ib_event_type event)
 {
@@ -630,16 +581,16 @@ static int rxe_notify(struct notifier_block *not_blk,
 		      void *arg)
 {
 	struct net_device *ndev = netdev_notifier_info_to_dev(arg);
-	struct rxe_dev *rxe = net_to_rxe(ndev);
+	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
 
 	if (!rxe)
-		goto out;
+		return NOTIFY_OK;
 
 	switch (event) {
 	case NETDEV_UNREGISTER:
-		list_del(&rxe->list);
-		rxe_remove(rxe);
-		break;
+		ib_unregister_device_and_put(&rxe->ib_dev);
+		return NOTIFY_OK;
+
 	case NETDEV_UP:
 		rxe_port_up(rxe);
 		break;
@@ -666,7 +617,8 @@ static int rxe_notify(struct notifier_block *not_blk,
 			event, ndev->name);
 		break;
 	}
-out:
+
+	ib_device_put(&rxe->ib_dev);
 	return NOTIFY_OK;
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index a04a076e03afbd..bb7e3268713ed6 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -390,8 +390,6 @@ void *rxe_alloc(struct rxe_pool *pool)
 	kref_get(&pool->ref_cnt);
 	read_unlock_irqrestore(&pool->pool_lock, flags);
 
-	kref_get(&pool->rxe->ref_cnt);
-
 	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
 		goto out_put_pool;
 
@@ -408,7 +406,6 @@ void *rxe_alloc(struct rxe_pool *pool)
 
 out_put_pool:
 	atomic_dec(&pool->num_elem);
-	rxe_dev_put(pool->rxe);
 	rxe_pool_put(pool);
 	return NULL;
 }
@@ -424,7 +421,6 @@ void rxe_elem_release(struct kref *kref)
 
 	kmem_cache_free(pool_cache(pool), elem);
 	atomic_dec(&pool->num_elem);
-	rxe_dev_put(pool->rxe);
 	rxe_pool_put(pool);
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c
index 73a19f808e1bab..87c17675cf9d9e 100644
--- a/drivers/infiniband/sw/rxe/rxe_sysfs.c
+++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
@@ -53,20 +53,14 @@ static int sanitize_arg(const char *val, char *intf, int intf_len)
 	return len;
 }
 
-static void rxe_set_port_state(struct net_device *ndev)
+static void rxe_set_port_state(struct rxe_dev *rxe, struct net_device *ndev)
 {
-	struct rxe_dev *rxe = net_to_rxe(ndev);
 	bool is_up = netif_running(ndev) && netif_carrier_ok(ndev);
 
-	if (!rxe)
-		goto out;
-
 	if (is_up)
 		rxe_port_up(rxe);
 	else
 		rxe_port_down(rxe); /* down for unknown state */
-out:
-	return;
 }
 
 static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
@@ -74,24 +68,25 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
 	int len;
 	int err = 0;
 	char intf[32];
-	struct net_device *ndev = NULL;
+	struct net_device *ndev;
+	struct rxe_dev *exists;
 	struct rxe_dev *rxe;
 
 	len = sanitize_arg(val, intf, sizeof(intf));
 	if (!len) {
 		pr_err("add: invalid interface name\n");
-		err = -EINVAL;
-		goto err;
+		return -EINVAL;
 	}
 
 	ndev = dev_get_by_name(&init_net, intf);
 	if (!ndev) {
 		pr_err("interface %s not found\n", intf);
-		err = -EINVAL;
-		goto err;
+		return -EINVAL;
 	}
 
-	if (net_to_rxe(ndev)) {
+	exists = rxe_get_dev_from_net(ndev);
+	if (exists) {
+		ib_device_put(&exists->ib_dev);
 		pr_err("already configured on %s\n", intf);
 		err = -EINVAL;
 		goto err;
@@ -104,11 +99,11 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
 		goto err;
 	}
 
-	rxe_set_port_state(ndev);
+	rxe_set_port_state(rxe, ndev);
 	dev_info(&rxe->ib_dev.dev, "added %s\n", intf);
+
 err:
-	if (ndev)
-		dev_put(ndev);
+	dev_put(ndev);
 	return err;
 }
 
@@ -116,7 +111,7 @@ static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
 {
 	int len;
 	char intf[32];
-	struct rxe_dev *rxe;
+	struct ib_device *ib_dev;
 
 	len = sanitize_arg(val, intf, sizeof(intf));
 	if (!len) {
@@ -126,20 +121,18 @@ static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
 
 	if (strncmp("all", intf, len) == 0) {
 		pr_info("rxe_sys: remove all");
-		rxe_remove_all();
+		ib_unregister_driver(RDMA_DRIVER_RXE);
 		return 0;
 	}
 
-	rxe = get_rxe_by_name(intf);
+	ib_dev = ib_device_get_by_name(intf, RDMA_DRIVER_RXE);
 
-	if (!rxe) {
+	if (!ib_dev) {
 		pr_err("not configured on %s\n", intf);
 		return -EINVAL;
 	}
 
-	list_del(&rxe->list);
-	rxe_remove(rxe);
-
+	ib_unregister_device_and_put(ib_dev);
 	return 0;
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 30817c79ba962f..28beee8f583823 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1286,10 +1286,3 @@ int rxe_register_device(struct rxe_dev *rxe)
 
 	return err;
 }
-
-void rxe_unregister_device(struct rxe_dev *rxe)
-{
-	struct ib_device *dev = &rxe->ib_dev;
-
-	ib_unregister_device(dev);
-}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 831381b7788da9..a0b63581278420 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -385,7 +385,6 @@ struct rxe_dev {
 	struct ib_device_attr	attr;
 	int			max_ucontext;
 	int			max_inline_data;
-	struct kref		ref_cnt;
 	struct mutex	usdev_lock;
 
 	struct net_device	*ndev;
@@ -412,7 +411,6 @@ struct rxe_dev {
 	u64			stats_counters[RXE_NUM_OF_COUNTERS];
 
 	struct rxe_port		port;
-	struct list_head	list;
 	struct crypto_shash	*tfm;
 };
 
@@ -467,7 +465,6 @@ static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
 }
 
 int rxe_register_device(struct rxe_dev *rxe);
-void rxe_unregister_device(struct rxe_dev *rxe);
 
 void rxe_mc_cleanup(struct rxe_pool_entry *arg);
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 85021451eee073..5fbeedacb3aa88 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2534,6 +2534,9 @@ struct ib_device {
 				 struct ib_counters_read_attr *counters_read_attr,
 				 struct uverbs_attr_bundle *attrs);
 
+	/* Called after all clients have been destroyed*/
+	void (*driver_unregister)(struct ib_device *dev);
+
 	/**
 	 * rdma netdev operation
 	 *
@@ -2653,6 +2656,8 @@ int ib_register_device(struct ib_device *device, const char *name,
 		       int (*port_callback)(struct ib_device *, u8,
 					    struct kobject *));
 void ib_unregister_device(struct ib_device *device);
+void ib_unregister_driver(enum rdma_driver_id driver_id);
+void ib_unregister_device_and_put(struct ib_device *device);
 
 int ib_register_client   (struct ib_client *client);
 void ib_unregister_client(struct ib_client *client);
@@ -3937,6 +3942,11 @@ static inline bool ib_access_writable(int access_flags)
 int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
 		       struct ib_mr_status *mr_status);
 
+void ib_device_put(struct ib_device *device);
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+					  enum rdma_driver_id driver_id);
+struct ib_device *ib_device_get_by_name(const char *name,
+					enum rdma_driver_id driver_id);
 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
 					    u16 pkey, const union ib_gid *gid,
 					    const struct sockaddr *addr);
Steve Wise Dec. 10, 2018, 4:27 p.m. UTC | #15
Hey Jason, A question on your patch inline below:

On 12/7/2018 1:25 PM, Jason Gunthorpe wrote:
> On Fri, Dec 07, 2018 at 10:12:44AM -0600, Steve Wise wrote:
>
>> Here is a way to ensure the module doesn't unload.  With this and my
>> current rxe changes, I think the races are all handled.
> I don't like messing with module refcount to solve a locking problem..
>
> Here something like this compile-tested thing..
>
> Instead of implementing dellink as a function call implement it as a
> flag 'ALLOW_USER_UNREGISTER'. The core code simply calls
> ib_unregister_device_and_put() upon dellink.
>
> rxe was full of lifetime bugs in this area, I think this fixes alot of
> them.. It probably replaces that other patch trying to do the same.
>
> >From 202ce0c783f0a2022ad49f5154be801b1d2a7b32 Mon Sep 17 00:00:00 2001
> From: Jason Gunthorpe <jgg@mellanox.com>
> Date: Fri, 7 Dec 2018 10:30:52 -0700
> Subject: [PATCH] verbs/rxe: Use core services to keep track of RXE things
>
> Replace
>  - The nonsense rxe_dev.kref with rxe_dev.ib_dev.dev.kref
>  - rxe_dev_list with core's device_list.
>  - Buggy net_to_rxe with ib_device_get_by_netdev that does proper
>    lifetime management
>  - Buggy get_rxe_by_name with ib_device_get_by_name
>
> Add
>  - ib_unregister_driver so drivers like rxe don't have to keep track
>    of things to do module unload
>  - ib_unregister_device_and_put to work with potiners returned by
>    that various ib_device_get_* functions
>
> The refcount change to the pool might need more work, but fundamentally
> the driver cannot have krefs open in the pool across unregistration or it
> contains module unlolad races.
>
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> ---
>  drivers/infiniband/core/core_priv.h   |   1 -
>  drivers/infiniband/core/device.c      | 210 ++++++++++++++++++++++++--
>  drivers/infiniband/sw/rxe/rxe.c       |  38 +----
>  drivers/infiniband/sw/rxe/rxe.h       |  14 +-
>  drivers/infiniband/sw/rxe/rxe_loc.h   |   2 +-
>  drivers/infiniband/sw/rxe/rxe_net.c   |  82 +++-------
>  drivers/infiniband/sw/rxe/rxe_pool.c  |   4 -
>  drivers/infiniband/sw/rxe/rxe_sysfs.c |  39 ++---
>  drivers/infiniband/sw/rxe/rxe_verbs.c |   7 -
>  drivers/infiniband/sw/rxe/rxe_verbs.h |   3 -
>  include/rdma/ib_verbs.h               |  10 ++
>  11 files changed, 257 insertions(+), 153 deletions(-)
>
> diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
> index cc7535c5e19233..1b2575430032d0 100644
> --- a/drivers/infiniband/core/core_priv.h
> +++ b/drivers/infiniband/core/core_priv.h
> @@ -267,7 +267,6 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
>  #endif
>  
>  struct ib_device *ib_device_get_by_index(u32 ifindex);
> -void ib_device_put(struct ib_device *device);
>  /* RDMA device netlink */
>  void nldev_init(void);
>  void nldev_exit(void);
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index 0027b0d79b09e5..ec60ea3429b024 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -85,6 +85,9 @@ static LIST_HEAD(client_list);
>  static DEFINE_MUTEX(device_mutex);
>  static DECLARE_RWSEM(lists_rwsem);
>  
> +/* Used to serialize ib_unregister_driver */
> +static DECLARE_RWSEM(unregister_rwsem);
> +
>  static int ib_security_change(struct notifier_block *nb, unsigned long event,
>  			      void *lsm_data);
>  static void ib_policy_change_task(struct work_struct *work);
> @@ -168,6 +171,7 @@ void ib_device_put(struct ib_device *device)
>  	if (refcount_dec_and_test(&device->refcount))
>  		complete(&device->unreg_completion);
>  }
> +EXPORT_SYMBOL(ib_device_put);
>  
>  static struct ib_device *__ib_device_get_by_name(const char *name)
>  {
> @@ -180,6 +184,27 @@ static struct ib_device *__ib_device_get_by_name(const char *name)
>  	return NULL;
>  }
>  
> +struct ib_device *ib_device_get_by_name(const char *name,
> +					enum rdma_driver_id driver_id)
> +{
> +	struct ib_device *device;
> +
> +	down_read(&lists_rwsem);
> +	device = __ib_device_get_by_name(name);
> +	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
> +	    device->driver_id != driver_id)
> +		device = NULL;
> +
> +	if (device) {
> +		/* Do not return a device if unregistration has started. */
> +		if (!refcount_inc_not_zero(&device->refcount))
> +			device = NULL;
> +	}
> +	up_read(&lists_rwsem);
> +	return device;
> +}
> +EXPORT_SYMBOL(ib_device_get_by_name);
> +
>  int ib_device_rename(struct ib_device *ibdev, const char *name)
>  {
>  	struct ib_device *device;
> @@ -641,28 +666,41 @@ int ib_register_device(struct ib_device *device, const char *name,
>  }
>  EXPORT_SYMBOL(ib_register_device);
>  
> -/**
> - * ib_unregister_device - Unregister an IB device
> - * @device:Device to unregister
> +/* Returns false if the device is already undergoing unregistration in another
> + * thread, and the device pointer may be invalid upon return.
>   *
> - * Unregister an IB device.  All clients will receive a remove callback.
> + * If true is returned then the caller owns a kref associated with the device
> + * (taken from the kref owned by the list).
> + *
> + * Upon return the device cannot be returned by any 'get' functions.
>   */
> -void ib_unregister_device(struct ib_device *device)
> +static bool ib_pre_unregister(struct ib_device *device)
> +{
> +	bool already_removed;
> +
> +	mutex_lock(&device_mutex);
> +	down_write(&lists_rwsem);
> +	already_removed = list_empty(&device->core_list);
> +	list_del_init(&device->core_list);
> +	up_write(&lists_rwsem);
> +	mutex_unlock(&device_mutex);
> +
> +	/* Matches the recound_set in ib_alloc_device */
> +	ib_device_put(device);
> +
> +	return !already_removed;
> +}
> +
> +static void __ib_unregister_device(struct ib_device *device)
>  {
>  	struct ib_client_data *context, *tmp;
>  	unsigned long flags;
>  
> -	/*
> -	 * Wait for all netlink command callers to finish working on the
> -	 * device.
> -	 */
> -	ib_device_put(device);
>  	wait_for_completion(&device->unreg_completion);
>  
>  	mutex_lock(&device_mutex);
>  
>  	down_write(&lists_rwsem);
> -	list_del(&device->core_list);
>  	write_lock_irq(&device->client_data_lock);
>  	list_for_each_entry(context, &device->client_data_list, list)
>  		context->going_down = true;
> @@ -696,9 +734,115 @@ void ib_unregister_device(struct ib_device *device)
>  	up_write(&lists_rwsem);
>  
>  	device->reg_state = IB_DEV_UNREGISTERED;
> +
> +	/*
> +	 * Drivers using the new flow may not call ib_dealloc_device except
> +	 * in error unwind prior to registration success.
> +	 */
> +	if (device->driver_unregister) {
> +		device->driver_unregister(device);
> +		ib_dealloc_device(device);
> +	}
> +}
> +
> +/**
> + * ib_unregister_device - Unregister an IB device
> + * @device: The device to unregister
> + *
> + * Unregister an IB device.  All clients will receive a remove callback.
> + *
> + * Callers can call this routine only once, and must protect against
> + * races. Typically it should only be called as part of a remove callback in
> + * an implementation of driver core's struct device_driver and related.
> + *
> + * A driver must choose to use either only ib_unregister_device or
> + * ib_unregister_device_and_put & ib_unregister_driver.
> + */
> +void ib_unregister_device(struct ib_device *device)
> +{
> +	down_read(&unregister_rwsem);
> +	if (ib_pre_unregister(device))
> +		__ib_unregister_device(device);
> +	else
> +		WARN(true, "Driver mixing ib_unregister_device with _and_put");
> +	up_read(&unregister_rwsem);
>  }
>  EXPORT_SYMBOL(ib_unregister_device);
>  
> +/**
> + * ib_unregister_device_and_put - Unregister a device while holding a 'get'
> + * device: The device to unregister
> + *
> + * This is the same as ib_unregister_device(), except it includes an internal
> + * ib_device_put() that should match a 'get' obtained by the caller.
> + *
> + * It is safe to call this routine concurrently from multiple threads while
> + * holding the 'get', however in this case return from the function does not
> + * guarantee the device is destroyed, only that destruction is in-progress.
> + *
> + * Drivers using this flow MUST use the driver_unregister callback to clean up
> + * their resources associated with the device and dealloc it.
> + */
> +void ib_unregister_device_and_put(struct ib_device *device)
> +{
> +	down_read(&unregister_rwsem);
> +	if (ib_pre_unregister(device)) {
> +		/*
> +		 * Caller's get can be returned now that we hold the kref,
> +		 * otherwise ib_pre_unregister returned the caller's get
> +		 */
> +		ib_device_put(device);
> +		__ib_unregister_device(device);
> +	}
> +	up_read(&unregister_rwsem);
> +}
> +EXPORT_SYMBOL(ib_unregister_device_and_put);
> +
> +/**
> + * ib_unregister_driver - Unregister all IB devices for a driver
> + * @driver_id: The driver to unregister
> + *
> + * This implements a fence for device unregistration. It only returns once all
> + * devices associated with the driver_id have fully completed their
> + * unregistration and returned from ib_unregister_device*().
> + *
> + * If device's are not yet unregistered it goes ahead and starts unregistering
> + * them.
> + *
> + * The caller must ensure that no devices can be be added while this is
> + * running (or at all for the module_unload case)
> + */
> +void ib_unregister_driver(enum rdma_driver_id driver_id)
> +{
> +	struct ib_device *device;
> +	LIST_HEAD(driver_devs);
> +
> +	down_write(&unregister_rwsem);
> +	mutex_lock(&device_mutex);
> +	down_write(&lists_rwsem);
> +	list_for_each_entry(device, &device_list, core_list) {
> +		if (device->driver_id == driver_id)
> +			list_move(&device->core_list, &driver_devs);
> +	}
> +	up_write(&lists_rwsem);
> +
> +	while ((device = list_first_entry_or_null(
> +			&driver_devs, struct ib_device, core_list))) {
> +		mutex_unlock(&device_mutex);
> +		/*
> +		 * We are holding the unregister_rwsem, so it is impossible
> +		 * for another thread to be doing registration.
> +		 */
> +		if (!ib_pre_unregister(device))
> +			WARN(true, "Impossible failure of ib_pre_unregister");
> +		__ib_unregister_device(device);
> +		mutex_lock(&device_mutex);
> +	}
> +	mutex_unlock(&device_mutex);
> +	up_write(&unregister_rwsem);
> +}
> +EXPORT_SYMBOL(ib_unregister_driver);
> +
>  /**
>   * ib_register_client - Register an IB client
>   * @client:Client to register
> @@ -981,6 +1125,50 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
>  		}
>  }
>  
> +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
> +					  enum rdma_driver_id driver_id)
> +{
> +	struct ib_device *ib_dev;
> +
> +	down_read(&lists_rwsem);
> +	list_for_each_entry(ib_dev, &device_list, core_list) {
> +		unsigned int port;
> +
> +		if (driver_id != RDMA_DRIVER_UNKNOWN &&
> +		    ib_dev->driver_id != driver_id)
> +			continue;
> +
> +		if (!ib_dev->get_netdev)
> +			continue;
> +
> +		for (port = rdma_start_port(ib_dev);
> +		     port <= rdma_end_port(ib_dev); port++) {
> +			struct net_device *idev;
> +
> +			if (!rdma_protocol_roce(ib_dev, port))
> +				continue;
> +

Why only roce devices?

> +			idev = ib_dev->get_netdev(ib_dev, port);
> +			if (idev != ndev)
> +				continue;
> +
> +			/*
> +			 * Do not return a device if unregistration has
> +			 * started.
> +			 */
> +			if (!refcount_inc_not_zero(&ib_dev->refcount))
> +				continue;
> +
> +			up_read(&lists_rwsem);
> +			return ib_dev;
> +		}
> +	}
> +	up_read(&lists_rwsem);
> +
> +	return NULL;
> +}
> +EXPORT_SYMBOL(ib_device_get_by_netdev);
> +
>  /**
>   * ib_enum_all_roce_netdevs - enumerate all RoCE devices
>   * @filter: Should we call the callback?
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 383e65c7bbc057..4069463f9a6641 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -50,8 +50,10 @@ static void rxe_cleanup_ports(struct rxe_dev *rxe)
>  /* free resources for a rxe device all objects created for this device must
>   * have been destroyed
>   */
> -static void rxe_cleanup(struct rxe_dev *rxe)
> +void rxe_unregistered(struct ib_device *ib_dev)
>  {
> +	struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
> +
>  	rxe_pool_cleanup(&rxe->uc_pool);
>  	rxe_pool_cleanup(&rxe->pd_pool);
>  	rxe_pool_cleanup(&rxe->ah_pool);
> @@ -68,15 +70,6 @@ static void rxe_cleanup(struct rxe_dev *rxe)
>  	crypto_free_shash(rxe->tfm);
>  }
>  
> -/* called when all references have been dropped */
> -void rxe_release(struct kref *kref)
> -{
> -	struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt);
> -
> -	rxe_cleanup(rxe);
> -	ib_dealloc_device(&rxe->ib_dev);
> -}
> -
>  /* initialize rxe device parameters */
>  static void rxe_init_device_param(struct rxe_dev *rxe)
>  {
> @@ -279,7 +272,6 @@ static int rxe_init(struct rxe_dev *rxe)
>  	spin_lock_init(&rxe->mmap_offset_lock);
>  	spin_lock_init(&rxe->pending_lock);
>  	INIT_LIST_HEAD(&rxe->pending_mmaps);
> -	INIT_LIST_HEAD(&rxe->list);
>  
>  	mutex_init(&rxe->usdev_lock);
>  
> @@ -312,31 +304,13 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
>  {
>  	int err;
>  
> -	kref_init(&rxe->ref_cnt);
> -
>  	err = rxe_init(rxe);
>  	if (err)
> -		goto err1;
> +		return err;
>  
>  	rxe_set_mtu(rxe, mtu);
>  
> -	err = rxe_register_device(rxe);
> -	if (err)
> -		goto err1;
> -
> -	return 0;
> -
> -err1:
> -	rxe_dev_put(rxe);
> -	return err;
> -}
> -
> -/* called by the ifc layer to remove a device */
> -void rxe_remove(struct rxe_dev *rxe)
> -{
> -	rxe_unregister_device(rxe);
> -
> -	rxe_dev_put(rxe);
> +	return rxe_register_device(rxe);
>  }
>  
>  static int __init rxe_module_init(void)
> @@ -360,7 +334,7 @@ static int __init rxe_module_init(void)
>  
>  static void __exit rxe_module_exit(void)
>  {
> -	rxe_remove_all();
> +	ib_unregister_driver(RDMA_DRIVER_RXE);
>  	rxe_net_exit();
>  	rxe_cache_exit();
>  
> diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
> index 8f79bd86d0337f..75e7113a756d22 100644
> --- a/drivers/infiniband/sw/rxe/rxe.h
> +++ b/drivers/infiniband/sw/rxe/rxe.h
> @@ -96,17 +96,19 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe,
>  void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
>  
>  int rxe_add(struct rxe_dev *rxe, unsigned int mtu);
> -void rxe_remove(struct rxe_dev *rxe);
> -void rxe_remove_all(void);
>  
>  void rxe_rcv(struct sk_buff *skb);
>  
> -static inline void rxe_dev_put(struct rxe_dev *rxe)
> +/* The caller must do a matching ib_device_put(&dev->ib_dev) */
> +static inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev)
>  {
> -	kref_put(&rxe->ref_cnt, rxe_release);
> +	struct ib_device *ibdev =
> +		ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE);
> +
> +	if (!ibdev)
> +		return NULL;
> +	return container_of(ibdev, struct rxe_dev, ib_dev);
>  }
> -struct rxe_dev *net_to_rxe(struct net_device *ndev);
> -struct rxe_dev *get_rxe_by_name(const char *name);
>  
>  void rxe_port_up(struct rxe_dev *rxe);
>  void rxe_port_down(struct rxe_dev *rxe);
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index a675c9f2b42783..e830d56a15c2a4 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -231,7 +231,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
>  		      struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
>  		      struct rxe_modify_srq_cmd *ucmd);
>  
> -void rxe_release(struct kref *kref);
> +void rxe_unregistered(struct ib_device *ib_dev);
>  
>  int rxe_completer(void *arg);
>  int rxe_requester(void *arg);
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
> index b26a8141f3edcb..d1f5959d9013a5 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.c
> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> @@ -45,43 +45,6 @@
>  #include "rxe_net.h"
>  #include "rxe_loc.h"
>  
> -static LIST_HEAD(rxe_dev_list);
> -static DEFINE_SPINLOCK(dev_list_lock); /* spinlock for device list */
> -
> -struct rxe_dev *net_to_rxe(struct net_device *ndev)
> -{
> -	struct rxe_dev *rxe;
> -	struct rxe_dev *found = NULL;
> -
> -	spin_lock_bh(&dev_list_lock);
> -	list_for_each_entry(rxe, &rxe_dev_list, list) {
> -		if (rxe->ndev == ndev) {
> -			found = rxe;
> -			break;
> -		}
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -
> -	return found;
> -}
> -
> -struct rxe_dev *get_rxe_by_name(const char *name)
> -{
> -	struct rxe_dev *rxe;
> -	struct rxe_dev *found = NULL;
> -
> -	spin_lock_bh(&dev_list_lock);
> -	list_for_each_entry(rxe, &rxe_dev_list, list) {
> -		if (!strcmp(name, dev_name(&rxe->ib_dev.dev))) {
> -			found = rxe;
> -			break;
> -		}
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -	return found;
> -}
> -
> -
>  static struct rxe_recv_sockets recv_sockets;
>  
>  struct device *rxe_dma_device(struct rxe_dev *rxe)
> @@ -229,12 +192,12 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>  	struct udphdr *udph;
>  	struct net_device *ndev = skb->dev;
>  	struct net_device *rdev = ndev;
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
> +	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
>  	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
>  
>  	if (!rxe && is_vlan_dev(rdev)) {
>  		rdev = vlan_dev_real_dev(ndev);
> -		rxe = net_to_rxe(rdev);
> +		rxe = rxe_get_dev_from_net(rdev);
>  	}
>  	if (!rxe)
>  		goto drop;
> @@ -253,6 +216,12 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>  
>  	rxe_rcv(skb);
>  
> +	/*
> +	 * FIXME: this is in the wrong place, it needs to be done when pkt is
> +	 * destroyed
> +	 */
> +	ib_device_put(&rxe->ib_dev);
> +
>  	return 0;
>  drop:
>  	kfree_skb(skb);
> @@ -558,36 +527,18 @@ struct rxe_dev *rxe_net_add(struct net_device *ndev)
>  	rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
>  	if (!rxe)
>  		return NULL;
> -
> +	rxe->ib_dev.driver_unregister = rxe_unregistered;
>  	rxe->ndev = ndev;
>  
>  	err = rxe_add(rxe, ndev->mtu);
>  	if (err) {
> -		ib_dealloc_device(&rxe->ib_dev);
> +		rxe_unregistered(&rxe->ib_dev);
>  		return NULL;
>  	}
>  
> -	spin_lock_bh(&dev_list_lock);
> -	list_add_tail(&rxe->list, &rxe_dev_list);
> -	spin_unlock_bh(&dev_list_lock);
>  	return rxe;
>  }
>  
> -void rxe_remove_all(void)
> -{
> -	spin_lock_bh(&dev_list_lock);
> -	while (!list_empty(&rxe_dev_list)) {
> -		struct rxe_dev *rxe =
> -			list_first_entry(&rxe_dev_list, struct rxe_dev, list);
> -
> -		list_del(&rxe->list);
> -		spin_unlock_bh(&dev_list_lock);
> -		rxe_remove(rxe);
> -		spin_lock_bh(&dev_list_lock);
> -	}
> -	spin_unlock_bh(&dev_list_lock);
> -}
> -
>  static void rxe_port_event(struct rxe_dev *rxe,
>  			   enum ib_event_type event)
>  {
> @@ -630,16 +581,16 @@ static int rxe_notify(struct notifier_block *not_blk,
>  		      void *arg)
>  {
>  	struct net_device *ndev = netdev_notifier_info_to_dev(arg);
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
> +	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
>  
>  	if (!rxe)
> -		goto out;
> +		return NOTIFY_OK;
>  
>  	switch (event) {
>  	case NETDEV_UNREGISTER:
> -		list_del(&rxe->list);
> -		rxe_remove(rxe);
> -		break;
> +		ib_unregister_device_and_put(&rxe->ib_dev);
> +		return NOTIFY_OK;
> +
>  	case NETDEV_UP:
>  		rxe_port_up(rxe);
>  		break;
> @@ -666,7 +617,8 @@ static int rxe_notify(struct notifier_block *not_blk,
>  			event, ndev->name);
>  		break;
>  	}
> -out:
> +
> +	ib_device_put(&rxe->ib_dev);
>  	return NOTIFY_OK;
>  }
>  
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> index a04a076e03afbd..bb7e3268713ed6 100644
> --- a/drivers/infiniband/sw/rxe/rxe_pool.c
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> @@ -390,8 +390,6 @@ void *rxe_alloc(struct rxe_pool *pool)
>  	kref_get(&pool->ref_cnt);
>  	read_unlock_irqrestore(&pool->pool_lock, flags);
>  
> -	kref_get(&pool->rxe->ref_cnt);
> -
>  	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
>  		goto out_put_pool;
>  
> @@ -408,7 +406,6 @@ void *rxe_alloc(struct rxe_pool *pool)
>  
>  out_put_pool:
>  	atomic_dec(&pool->num_elem);
> -	rxe_dev_put(pool->rxe);
>  	rxe_pool_put(pool);
>  	return NULL;
>  }
> @@ -424,7 +421,6 @@ void rxe_elem_release(struct kref *kref)
>  
>  	kmem_cache_free(pool_cache(pool), elem);
>  	atomic_dec(&pool->num_elem);
> -	rxe_dev_put(pool->rxe);
>  	rxe_pool_put(pool);
>  }
>  
> diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c
> index 73a19f808e1bab..87c17675cf9d9e 100644
> --- a/drivers/infiniband/sw/rxe/rxe_sysfs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
> @@ -53,20 +53,14 @@ static int sanitize_arg(const char *val, char *intf, int intf_len)
>  	return len;
>  }
>  
> -static void rxe_set_port_state(struct net_device *ndev)
> +static void rxe_set_port_state(struct rxe_dev *rxe, struct net_device *ndev)
>  {
> -	struct rxe_dev *rxe = net_to_rxe(ndev);
>  	bool is_up = netif_running(ndev) && netif_carrier_ok(ndev);
>  
> -	if (!rxe)
> -		goto out;
> -
>  	if (is_up)
>  		rxe_port_up(rxe);
>  	else
>  		rxe_port_down(rxe); /* down for unknown state */
> -out:
> -	return;
>  }
>  
>  static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
> @@ -74,24 +68,25 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
>  	int len;
>  	int err = 0;
>  	char intf[32];
> -	struct net_device *ndev = NULL;
> +	struct net_device *ndev;
> +	struct rxe_dev *exists;
>  	struct rxe_dev *rxe;
>  
>  	len = sanitize_arg(val, intf, sizeof(intf));
>  	if (!len) {
>  		pr_err("add: invalid interface name\n");
> -		err = -EINVAL;
> -		goto err;
> +		return -EINVAL;
>  	}
>  
>  	ndev = dev_get_by_name(&init_net, intf);
>  	if (!ndev) {
>  		pr_err("interface %s not found\n", intf);
> -		err = -EINVAL;
> -		goto err;
> +		return -EINVAL;
>  	}
>  
> -	if (net_to_rxe(ndev)) {
> +	exists = rxe_get_dev_from_net(ndev);
> +	if (exists) {
> +		ib_device_put(&exists->ib_dev);
>  		pr_err("already configured on %s\n", intf);
>  		err = -EINVAL;
>  		goto err;
> @@ -104,11 +99,11 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
>  		goto err;
>  	}
>  
> -	rxe_set_port_state(ndev);
> +	rxe_set_port_state(rxe, ndev);
>  	dev_info(&rxe->ib_dev.dev, "added %s\n", intf);
> +
>  err:
> -	if (ndev)
> -		dev_put(ndev);
> +	dev_put(ndev);
>  	return err;
>  }
>  
> @@ -116,7 +111,7 @@ static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
>  {
>  	int len;
>  	char intf[32];
> -	struct rxe_dev *rxe;
> +	struct ib_device *ib_dev;
>  
>  	len = sanitize_arg(val, intf, sizeof(intf));
>  	if (!len) {
> @@ -126,20 +121,18 @@ static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
>  
>  	if (strncmp("all", intf, len) == 0) {
>  		pr_info("rxe_sys: remove all");
> -		rxe_remove_all();
> +		ib_unregister_driver(RDMA_DRIVER_RXE);
>  		return 0;
>  	}
>  
> -	rxe = get_rxe_by_name(intf);
> +	ib_dev = ib_device_get_by_name(intf, RDMA_DRIVER_RXE);
>  
> -	if (!rxe) {
> +	if (!ib_dev) {
>  		pr_err("not configured on %s\n", intf);
>  		return -EINVAL;
>  	}
>  
> -	list_del(&rxe->list);
> -	rxe_remove(rxe);
> -
> +	ib_unregister_device_and_put(ib_dev);
>  	return 0;
>  }
>  
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index 30817c79ba962f..28beee8f583823 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -1286,10 +1286,3 @@ int rxe_register_device(struct rxe_dev *rxe)
>  
>  	return err;
>  }
> -
> -void rxe_unregister_device(struct rxe_dev *rxe)
> -{
> -	struct ib_device *dev = &rxe->ib_dev;
> -
> -	ib_unregister_device(dev);
> -}
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index 831381b7788da9..a0b63581278420 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -385,7 +385,6 @@ struct rxe_dev {
>  	struct ib_device_attr	attr;
>  	int			max_ucontext;
>  	int			max_inline_data;
> -	struct kref		ref_cnt;
>  	struct mutex	usdev_lock;
>  
>  	struct net_device	*ndev;
> @@ -412,7 +411,6 @@ struct rxe_dev {
>  	u64			stats_counters[RXE_NUM_OF_COUNTERS];
>  
>  	struct rxe_port		port;
> -	struct list_head	list;
>  	struct crypto_shash	*tfm;
>  };
>  
> @@ -467,7 +465,6 @@ static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
>  }
>  
>  int rxe_register_device(struct rxe_dev *rxe);
> -void rxe_unregister_device(struct rxe_dev *rxe);
>  
>  void rxe_mc_cleanup(struct rxe_pool_entry *arg);
>  
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 85021451eee073..5fbeedacb3aa88 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -2534,6 +2534,9 @@ struct ib_device {
>  				 struct ib_counters_read_attr *counters_read_attr,
>  				 struct uverbs_attr_bundle *attrs);
>  
> +	/* Called after all clients have been destroyed*/
> +	void (*driver_unregister)(struct ib_device *dev);
> +
>  	/**
>  	 * rdma netdev operation
>  	 *
> @@ -2653,6 +2656,8 @@ int ib_register_device(struct ib_device *device, const char *name,
>  		       int (*port_callback)(struct ib_device *, u8,
>  					    struct kobject *));
>  void ib_unregister_device(struct ib_device *device);
> +void ib_unregister_driver(enum rdma_driver_id driver_id);
> +void ib_unregister_device_and_put(struct ib_device *device);
>  
>  int ib_register_client   (struct ib_client *client);
>  void ib_unregister_client(struct ib_client *client);
> @@ -3937,6 +3942,11 @@ static inline bool ib_access_writable(int access_flags)
>  int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
>  		       struct ib_mr_status *mr_status);
>  
> +void ib_device_put(struct ib_device *device);
> +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
> +					  enum rdma_driver_id driver_id);
> +struct ib_device *ib_device_get_by_name(const char *name,
> +					enum rdma_driver_id driver_id);
>  struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
>  					    u16 pkey, const union ib_gid *gid,
>  					    const struct sockaddr *addr);
Jason Gunthorpe Dec. 10, 2018, 5:17 p.m. UTC | #16
On Mon, Dec 10, 2018 at 10:27:47AM -0600, Steve Wise wrote:

> > +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
> > +					  enum rdma_driver_id driver_id)
> > +{
> > +	struct ib_device *ib_dev;
> > +
> > +	down_read(&lists_rwsem);
> > +	list_for_each_entry(ib_dev, &device_list, core_list) {
> > +		unsigned int port;
> > +
> > +		if (driver_id != RDMA_DRIVER_UNKNOWN &&
> > +		    ib_dev->driver_id != driver_id)
> > +			continue;
> > +
> > +		if (!ib_dev->get_netdev)
> > +			continue;
> > +
> > +		for (port = rdma_start_port(ib_dev);
> > +		     port <= rdma_end_port(ib_dev); port++) {
> > +			struct net_device *idev;
> > +
> > +			if (!rdma_protocol_roce(ib_dev, port))
> > +				continue;
> > +
> 
> Why only roce devices?

No specific reason, this pattern was just in the code above I copied
this from. Both places probably need some revision to support iwarp.
 
Jason
Steve Wise Dec. 10, 2018, 7:47 p.m. UTC | #17
On 12/7/2018 1:25 PM, Jason Gunthorpe wrote:
> I don't like messing with module refcount to solve a locking problem..
> Here something like this compile-tested thing..
>
> Instead of implementing dellink as a function call implement it as a
> flag 'ALLOW_USER_UNREGISTER'. The core code simply calls
> ib_unregister_device_and_put() upon dellink.
>
Why don't we just mandate that if a driver uses rdma_link_ops, then it
must support the core doing the ib_unregister_device_and_put() on a
DELLINK nlmsg?

Steve.
Jason Gunthorpe Dec. 10, 2018, 8:07 p.m. UTC | #18
On Mon, Dec 10, 2018 at 01:47:39PM -0600, Steve Wise wrote:
> 
> On 12/7/2018 1:25 PM, Jason Gunthorpe wrote:
> > I don't like messing with module refcount to solve a locking problem..
> > Here something like this compile-tested thing..
> >
> > Instead of implementing dellink as a function call implement it as a
> > flag 'ALLOW_USER_UNREGISTER'. The core code simply calls
> > ib_unregister_device_and_put() upon dellink.
> >
> Why don't we just mandate that if a driver uses rdma_link_ops, then it
> must support the core doing the ib_unregister_device_and_put() on a
> DELLINK nlmsg?

Yes, exactly.

But conversely we must prevent dellink from running if the device was
not created by a compatible driver

Jason
Steve Wise Dec. 10, 2018, 8:15 p.m. UTC | #19
On 12/10/2018 2:07 PM, Jason Gunthorpe wrote:
> On Mon, Dec 10, 2018 at 01:47:39PM -0600, Steve Wise wrote:
>> On 12/7/2018 1:25 PM, Jason Gunthorpe wrote:
>>> I don't like messing with module refcount to solve a locking problem..
>>> Here something like this compile-tested thing..
>>>
>>> Instead of implementing dellink as a function call implement it as a
>>> flag 'ALLOW_USER_UNREGISTER'. The core code simply calls
>>> ib_unregister_device_and_put() upon dellink.
>>>
>> Why don't we just mandate that if a driver uses rdma_link_ops, then it
>> must support the core doing the ib_unregister_device_and_put() on a
>> DELLINK nlmsg?
> Yes, exactly.
>
> But conversely we must prevent dellink from running if the device was
> not created by a compatible driver

That'll be handled by virtue that only devices created via NEWLINK will
have the driver's link_ops pointer cached in the ib_device.
Jason Gunthorpe Dec. 10, 2018, 8:26 p.m. UTC | #20
On Mon, Dec 10, 2018 at 02:15:18PM -0600, Steve Wise wrote:
> 
> On 12/10/2018 2:07 PM, Jason Gunthorpe wrote:
> > On Mon, Dec 10, 2018 at 01:47:39PM -0600, Steve Wise wrote:
> >> On 12/7/2018 1:25 PM, Jason Gunthorpe wrote:
> >>> I don't like messing with module refcount to solve a locking problem..
> >>> Here something like this compile-tested thing..
> >>>
> >>> Instead of implementing dellink as a function call implement it as a
> >>> flag 'ALLOW_USER_UNREGISTER'. The core code simply calls
> >>> ib_unregister_device_and_put() upon dellink.
> >>>
> >> Why don't we just mandate that if a driver uses rdma_link_ops, then it
> >> must support the core doing the ib_unregister_device_and_put() on a
> >> DELLINK nlmsg?
> > Yes, exactly.
> >
> > But conversely we must prevent dellink from running if the device was
> > not created by a compatible driver
> 
> That'll be handled by virtue that only devices created via NEWLINK will
> have the driver's link_ops pointer cached in the ib_device.

this is fine, but do we need link_ops to be stored if we are just
calling ib_unregister_device_and_put() There is not really anyneed for
a dellink callback, right?

Jason
Steve Wise Dec. 10, 2018, 8:43 p.m. UTC | #21
On 12/10/2018 2:26 PM, Jason Gunthorpe wrote:
> On Mon, Dec 10, 2018 at 02:15:18PM -0600, Steve Wise wrote:
>> On 12/10/2018 2:07 PM, Jason Gunthorpe wrote:
>>> On Mon, Dec 10, 2018 at 01:47:39PM -0600, Steve Wise wrote:
>>>> On 12/7/2018 1:25 PM, Jason Gunthorpe wrote:
>>>>> I don't like messing with module refcount to solve a locking problem..
>>>>> Here something like this compile-tested thing..
>>>>>
>>>>> Instead of implementing dellink as a function call implement it as a
>>>>> flag 'ALLOW_USER_UNREGISTER'. The core code simply calls
>>>>> ib_unregister_device_and_put() upon dellink.
>>>>>
>>>> Why don't we just mandate that if a driver uses rdma_link_ops, then it
>>>> must support the core doing the ib_unregister_device_and_put() on a
>>>> DELLINK nlmsg?
>>> Yes, exactly.
>>>
>>> But conversely we must prevent dellink from running if the device was
>>> not created by a compatible driver
>> That'll be handled by virtue that only devices created via NEWLINK will
>> have the driver's link_ops pointer cached in the ib_device.
> this is fine, but do we need link_ops to be stored if we are just
> calling ib_unregister_device_and_put() There is not really anyneed for
> a dellink callback, right?
>
> Jason

Correct.  I get it.  So we just need a flag indicating this device can
be deleted via DELLINK.  (just like you said earlier! :) )

It might take a while, but eventually things do sink into my noggin'. ;)

Steve.
diff mbox series

Patch

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 9abbadb9e366..b6d97c592074 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -33,6 +33,7 @@ 
 #include <linux/module.h>
 #include <linux/pid.h>
 #include <linux/pid_namespace.h>
+#include <linux/mutex.h>
 #include <net/netlink.h>
 #include <rdma/rdma_cm.h>
 #include <rdma/rdma_netlink.h>
@@ -107,6 +108,8 @@ 
 	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
 	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type = NLA_NUL_STRING,
+				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -1104,6 +1107,129 @@  static int nldev_res_get_pd_dumpit(struct sk_buff *skb,
 	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
 }
 
+static LIST_HEAD(link_ops);
+static DECLARE_RWSEM(link_ops_rwsem);
+
+static const struct rdma_link_ops *link_ops_get(const char *type)
+{
+	const struct rdma_link_ops *ops;
+
+	list_for_each_entry(ops, &link_ops, list) {
+		if (!strcmp(ops->type, type))
+			goto out;
+	}
+	ops = NULL;
+out:
+	return ops;
+}
+
+void rdma_link_register(struct rdma_link_ops *ops)
+{
+	down_write(&link_ops_rwsem);
+	if (link_ops_get(ops->type)) {
+		WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type);
+		goto out;
+	}
+	list_add(&ops->list, &link_ops);
+out:
+	up_write(&link_ops_rwsem);
+}
+EXPORT_SYMBOL(rdma_link_register);
+
+void rdma_link_unregister(struct rdma_link_ops *ops)
+{
+	down_write(&link_ops_rwsem);
+	list_del(&ops->list);
+	up_write(&link_ops_rwsem);
+}
+EXPORT_SYMBOL(rdma_link_unregister);
+
+static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	char ibdev_name[IB_DEVICE_NAME_MAX];
+	const struct rdma_link_ops *ops;
+	struct ib_device *device;
+	char ndev_name[IFNAMSIZ];
+	char type[IFNAMSIZ];
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
+		return -EINVAL;
+
+	nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
+		    sizeof(ibdev_name));
+	if (strchr(ibdev_name, '%'))
+		return -EINVAL;
+
+	nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
+	nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
+		    sizeof(ndev_name));
+
+	down_read(&link_ops_rwsem);
+	ops = link_ops_get(type);
+#ifdef CONFIG_MODULES
+	if (!ops) {
+		up_read(&link_ops_rwsem);
+		request_module("rdma-link-%s", type);
+		down_read(&link_ops_rwsem);
+		ops = link_ops_get(type);
+	}
+#endif
+	if (ops) {
+		device = ops->newlink(ibdev_name, ndev_name);
+		if (IS_ERR(device))
+			err = PTR_ERR(device);
+		else
+			device->link_ops = ops;
+	} else {
+		err = -ENODEV;
+	}
+	up_read(&link_ops_rwsem);
+
+	return err;
+}
+
+static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	const struct rdma_link_ops *ops;
+	struct device *dma_device;
+	struct ib_device *device;
+	u32 index;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(index);
+	if (!device)
+		return -EINVAL;
+
+	ops = device->link_ops;
+
+	/*
+	 * Deref the ib_device before deleting it.  Otherwise we
+	 * deadlock unregistering the device.  Hold a ref on the
+	 * underlying dma_device though to keep the memory around
+	 * until we're done.
+	 */
+	dma_device = get_device(device->dma_device);
+	ib_device_put(device);
+	err = ops ? ops->dellink(device) : -ENODEV;
+	put_device(dma_device);
+
+	return err;
+}
+
 static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 	[RDMA_NLDEV_CMD_GET] = {
 		.doit = nldev_get_doit,
@@ -1113,6 +1239,14 @@  static int nldev_res_get_pd_dumpit(struct sk_buff *skb,
 		.doit = nldev_set_doit,
 		.flags = RDMA_NL_ADMIN_PERM,
 	},
+	[RDMA_NLDEV_CMD_NEWLINK] = {
+		.doit = nldev_newlink,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
+	[RDMA_NLDEV_CMD_DELLINK] = {
+		.doit = nldev_dellink,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 	[RDMA_NLDEV_CMD_PORT_GET] = {
 		.doit = nldev_port_get_doit,
 		.dump = nldev_port_get_dumpit,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 85021451eee0..237d87c52e3a 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2612,6 +2612,8 @@  struct ib_device {
 	 */
 	refcount_t refcount;
 	struct completion unreg_completion;
+
+	const struct rdma_link_ops     *link_ops;
 };
 
 struct ib_client {
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
index 70218e6b5187..9b50e957cbae 100644
--- a/include/rdma/rdma_netlink.h
+++ b/include/rdma/rdma_netlink.h
@@ -99,4 +99,17 @@  int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
  * Returns true on success or false if no listeners.
  */
 bool rdma_nl_chk_listeners(unsigned int group);
+
+struct rdma_link_ops {
+	struct list_head list;
+	const char *type;
+	struct ib_device *(*newlink)(const char *ibdev_name,
+				     const char *ndev_name);
+	int (*dellink)(struct ib_device *ibdev);
+};
+
+void rdma_link_register(struct rdma_link_ops *ops);
+void rdma_link_unregister(struct rdma_link_ops *ops);
+
+#define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type)
 #endif /* _RDMA_NETLINK_H */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index f9c41bf59efc..dfdfc2b608b8 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -229,9 +229,11 @@  enum rdma_nldev_command {
 	RDMA_NLDEV_CMD_GET, /* can dump */
 	RDMA_NLDEV_CMD_SET,
 
-	/* 3 - 4 are free to use */
+	RDMA_NLDEV_CMD_NEWLINK,
 
-	RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */
+	RDMA_NLDEV_CMD_DELLINK,
+
+	RDMA_NLDEV_CMD_PORT_GET, /* can dump */
 
 	/* 6 - 8 are free to use */
 
@@ -428,6 +430,11 @@  enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_DRIVER_U64,		/* u64 */
 
 	/*
+	 * Identifies the rdma driver. eg: "rxe" or "siw"
+	 */
+	RDMA_NLDEV_ATTR_LINK_TYPE,		/* string */
+
+	/*
 	 * Always the end
 	 */
 	RDMA_NLDEV_ATTR_MAX