Message ID | 0a9f1a548638a70242af3b0a783756ca2bf985ef.1543359232.git.swise@opengridcomputing.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Jason Gunthorpe |
Headers | show |
Series | Dynamic rdma link creation | expand |
>-----Original Message----- >From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- >owner@vger.kernel.org] On Behalf Of Steve Wise >Sent: Tuesday, November 27, 2018 10:37 AM >To: dledford@redhat.com; jgg@mellanox.com >Cc: linux-rdma@vger.kernel.org; BMT@zurich.ibm.com >Subject: [PATCH v3 1/2] RDMA/Core: add >RDMA_NLDEV_CMD_NEWLINK/DELLINK support > >Add support for new LINK messages to allow adding and deleting rdma >interfaces. This will be used initially for soft rdma drivers which >instantiate device instances dynamically by the admin specifying a >netdev device to use. The rdma_rxe module will be the first user of >these messages. > >The design is modeled after RTNL_NEWLINK/DELLINK: rdma drivers >register with the rdma core if they provide link add/delete functions. >Each driver registers with a unique "type" string, that is used to >dispatch messages coming from user space. A new RDMA_NLDEV_ATTR is >defined for the "type" string. User mode will pass 3 attributes in a >NEWLINK message: RDMA_NLDEV_ATTR_IBDEV_NAME for the desired rdma >device >name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type" of link >being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the net_device interface >to use for this link. The DELLINK message will contain the IBDEV_NAME >and LINK_TYPE attributes. > >Signed-off-by: Steve Wise <swise@opengridcomputing.com> >Reviewed-by: Leon Romanovsky <leonro@mellanox.com> >--- > drivers/infiniband/core/nldev.c | 131 >+++++++++++++++++++++++++++++++++++++++ > include/rdma/rdma_netlink.h | 12 ++++ > include/uapi/rdma/rdma_netlink.h | 11 +++- > 3 files changed, 152 insertions(+), 2 deletions(-) > >diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c >index 63cc74483188..04d8af0ccb25 100644 >--- a/drivers/infiniband/core/nldev.c >+++ b/drivers/infiniband/core/nldev.c >@@ -33,6 +33,7 @@ > #include <linux/module.h> > #include <linux/pid.h> > #include <linux/pid_namespace.h> >+#include <linux/mutex.h> > #include <net/netlink.h> > #include <rdma/rdma_cm.h> > #include <rdma/rdma_netlink.h> >@@ -107,6 +108,8 @@ > [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, > [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, > [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, >+ [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = >NLA_NUL_STRING, >+ .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, > }; > > static int put_driver_name_print_type(struct sk_buff *msg, const char *name, >@@ -1103,6 +1106,126 @@ static int nldev_res_get_pd_dumpit(struct sk_buff >*skb, > return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD); > } > >+static LIST_HEAD(link_ops); >+static DEFINE_MUTEX(link_ops_mutex); >+ >+static const struct rdma_link_ops *link_ops_get(const char *type) >+{ >+ const struct rdma_link_ops *ops; >+ >+ list_for_each_entry(ops, &link_ops, list) { >+ if (!strcmp(ops->type, type)) >+ goto out; >+ } >+ ops = NULL; >+out: >+ return ops; >+} >+ >+void rdma_link_register(struct rdma_link_ops *ops) >+{ >+ mutex_lock(&link_ops_mutex); >+ if (link_ops_get(ops->type)) { >+ WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type); >+ goto out; >+ } >+ list_add(&ops->list, &link_ops); >+out: >+ mutex_unlock(&link_ops_mutex); >+} >+EXPORT_SYMBOL(rdma_link_register); >+ >+void rdma_link_unregister(struct rdma_link_ops *ops) >+{ >+ mutex_lock(&link_ops_mutex); >+ list_del(&ops->list); >+ mutex_unlock(&link_ops_mutex); >+} >+EXPORT_SYMBOL(rdma_link_unregister); >+ >+static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, >+ struct netlink_ext_ack *extack) >+{ >+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; >+ char ibdev_name[IB_DEVICE_NAME_MAX]; >+ const struct rdma_link_ops *ops; >+ char ndev_name[IFNAMSIZ]; >+ char type[IFNAMSIZ]; >+ int err; >+ >+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, >+ nldev_policy, extack); >+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || >+ !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || >!tb[RDMA_NLDEV_ATTR_NDEV_NAME]) >+ return -EINVAL; >+ >+ if (nla_len(tb[RDMA_NLDEV_ATTR_DEV_NAME]) > sizeof(ibdev_name) || >+ nla_len(tb[RDMA_NLDEV_ATTR_LINK_TYPE]) > sizeof(type) || >+ nla_len(tb[RDMA_NLDEV_ATTR_NDEV_NAME]) > sizeof(ndev_name)) >+ return -EINVAL; >+ >+ nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], >+ sizeof(ibdev_name)); >+ if (strchr(ibdev_name, '%')) >+ return -EINVAL; >+ >+ nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); >+ nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], >+ sizeof(ndev_name)); >+ >+ mutex_lock(&link_ops_mutex); >+ ops = link_ops_get(type); >+#ifdef CONFIG_MODULES >+ if (!ops) { >+ mutex_unlock(&link_ops_mutex); >+ request_module("rdma-link-%s", type); >+ mutex_lock(&link_ops_mutex); >+ ops = link_ops_get(type); >+ } >+#endif >+ if (ops) >+ err = ops->newlink(ibdev_name, ndev_name); >+ else >+ err = -ENODEV; >+ mutex_unlock(&link_ops_mutex); Hi Steve, Should the mutex be held while calling the callback? If the newlink or dellink callbacks need to sleep (probably dellink specifically), I think holding the mutex might not be a good idea. Mike >+ return err; >+} >+ >+static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, >+ struct netlink_ext_ack *extack) >+{ >+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; >+ char ibdev_name[IB_DEVICE_NAME_MAX]; >+ const struct rdma_link_ops *ops; >+ char type[IFNAMSIZ]; >+ int err; >+ >+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, >+ nldev_policy, extack); >+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || >+ !tb[RDMA_NLDEV_ATTR_LINK_TYPE]) >+ return -EINVAL; >+ >+ if (nla_len(tb[RDMA_NLDEV_ATTR_DEV_NAME]) > sizeof(ibdev_name) || >+ nla_len(tb[RDMA_NLDEV_ATTR_LINK_TYPE]) > sizeof(type)) >+ return -EINVAL; >+ >+ nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], >+ sizeof(ibdev_name)); >+ nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); >+ >+ mutex_lock(&link_ops_mutex); >+ ops = link_ops_get(type); >+ if (ops) >+ err = ops->dellink(ibdev_name); >+ else >+ err = -ENODEV; >+ mutex_unlock(&link_ops_mutex); >+ >+ return err; >+} >+ > static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { > [RDMA_NLDEV_CMD_GET] = { > .doit = nldev_get_doit, >@@ -1112,6 +1235,14 @@ static int nldev_res_get_pd_dumpit(struct sk_buff >*skb, > .doit = nldev_set_doit, > .flags = RDMA_NL_ADMIN_PERM, > }, >+ [RDMA_NLDEV_CMD_NEWLINK] = { >+ .doit = nldev_newlink, >+ .flags = RDMA_NL_ADMIN_PERM, >+ }, >+ [RDMA_NLDEV_CMD_DELLINK] = { >+ .doit = nldev_dellink, >+ .flags = RDMA_NL_ADMIN_PERM, >+ }, > [RDMA_NLDEV_CMD_PORT_GET] = { > .doit = nldev_port_get_doit, > .dump = nldev_port_get_dumpit, >diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h >index 70218e6b5187..42daca1589bd 100644 >--- a/include/rdma/rdma_netlink.h >+++ b/include/rdma/rdma_netlink.h >@@ -99,4 +99,16 @@ int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, > * Returns true on success or false if no listeners. > */ > bool rdma_nl_chk_listeners(unsigned int group); >+ >+struct rdma_link_ops { >+ struct list_head list; >+ const char *type; >+ int (*newlink)(const char *ibdev_name, const char *ndev_name); >+ int (*dellink)(const char *ibdev_name); >+}; >+ >+void rdma_link_register(struct rdma_link_ops *ops); >+void rdma_link_unregister(struct rdma_link_ops *ops); >+ >+#define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type) > #endif /* _RDMA_NETLINK_H */ >diff --git a/include/uapi/rdma/rdma_netlink.h >b/include/uapi/rdma/rdma_netlink.h >index f9c41bf59efc..dfdfc2b608b8 100644 >--- a/include/uapi/rdma/rdma_netlink.h >+++ b/include/uapi/rdma/rdma_netlink.h >@@ -229,9 +229,11 @@ enum rdma_nldev_command { > RDMA_NLDEV_CMD_GET, /* can dump */ > RDMA_NLDEV_CMD_SET, > >- /* 3 - 4 are free to use */ >+ RDMA_NLDEV_CMD_NEWLINK, > >- RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */ >+ RDMA_NLDEV_CMD_DELLINK, >+ >+ RDMA_NLDEV_CMD_PORT_GET, /* can dump */ > > /* 6 - 8 are free to use */ > >@@ -428,6 +430,11 @@ enum rdma_nldev_attr { > RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */ > > /* >+ * Identifies the rdma driver. eg: "rxe" or "siw" >+ */ >+ RDMA_NLDEV_ATTR_LINK_TYPE, /* string */ >+ >+ /* > * Always the end > */ > RDMA_NLDEV_ATTR_MAX >-- >1.8.3.1
On Fri, Nov 30, 2018 at 01:29:22AM +0000, Ruhl, Michael J wrote: > Should the mutex be held while calling the callback? If the newlink or dellink > callbacks need to sleep (probably dellink specifically), I think holding the mutex > might not be a good idea. It would be a bit better if this lock was a rwsem, but it must be held whenever ops is touched.. This is probably a moot point though when dellink goes through the ib_dev not the ops list.. Jason
Hey Mike, On 11/29/2018 7:29 PM, Ruhl, Michael J wrote: >> -----Original Message----- >> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- >> owner@vger.kernel.org] On Behalf Of Steve Wise >> Sent: Tuesday, November 27, 2018 10:37 AM >> To: dledford@redhat.com; jgg@mellanox.com >> Cc: linux-rdma@vger.kernel.org; BMT@zurich.ibm.com >> Subject: [PATCH v3 1/2] RDMA/Core: add >> RDMA_NLDEV_CMD_NEWLINK/DELLINK support >> >> Add support for new LINK messages to allow adding and deleting rdma >> interfaces. This will be used initially for soft rdma drivers which >> instantiate device instances dynamically by the admin specifying a >> netdev device to use. The rdma_rxe module will be the first user of >> these messages. >> >> The design is modeled after RTNL_NEWLINK/DELLINK: rdma drivers >> register with the rdma core if they provide link add/delete functions. >> Each driver registers with a unique "type" string, that is used to >> dispatch messages coming from user space. A new RDMA_NLDEV_ATTR is >> defined for the "type" string. User mode will pass 3 attributes in a >> NEWLINK message: RDMA_NLDEV_ATTR_IBDEV_NAME for the desired rdma >> device >> name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type" of link >> being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the net_device interface >> to use for this link. The DELLINK message will contain the IBDEV_NAME >> and LINK_TYPE attributes. >> >> Signed-off-by: Steve Wise <swise@opengridcomputing.com> >> Reviewed-by: Leon Romanovsky <leonro@mellanox.com> >> --- >> drivers/infiniband/core/nldev.c | 131 >> +++++++++++++++++++++++++++++++++++++++ >> include/rdma/rdma_netlink.h | 12 ++++ >> include/uapi/rdma/rdma_netlink.h | 11 +++- >> 3 files changed, 152 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c >> index 63cc74483188..04d8af0ccb25 100644 >> --- a/drivers/infiniband/core/nldev.c >> +++ b/drivers/infiniband/core/nldev.c >> @@ -33,6 +33,7 @@ >> #include <linux/module.h> >> #include <linux/pid.h> >> #include <linux/pid_namespace.h> >> +#include <linux/mutex.h> >> #include <net/netlink.h> >> #include <rdma/rdma_cm.h> >> #include <rdma/rdma_netlink.h> >> @@ -107,6 +108,8 @@ >> [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, >> [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, >> [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, >> + [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = >> NLA_NUL_STRING, >> + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, >> }; >> >> static int put_driver_name_print_type(struct sk_buff *msg, const char *name, >> @@ -1103,6 +1106,126 @@ static int nldev_res_get_pd_dumpit(struct sk_buff >> *skb, >> return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD); >> } >> >> +static LIST_HEAD(link_ops); >> +static DEFINE_MUTEX(link_ops_mutex); >> + >> +static const struct rdma_link_ops *link_ops_get(const char *type) >> +{ >> + const struct rdma_link_ops *ops; >> + >> + list_for_each_entry(ops, &link_ops, list) { >> + if (!strcmp(ops->type, type)) >> + goto out; >> + } >> + ops = NULL; >> +out: >> + return ops; >> +} >> + >> +void rdma_link_register(struct rdma_link_ops *ops) >> +{ >> + mutex_lock(&link_ops_mutex); >> + if (link_ops_get(ops->type)) { >> + WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type); >> + goto out; >> + } >> + list_add(&ops->list, &link_ops); >> +out: >> + mutex_unlock(&link_ops_mutex); >> +} >> +EXPORT_SYMBOL(rdma_link_register); >> + >> +void rdma_link_unregister(struct rdma_link_ops *ops) >> +{ >> + mutex_lock(&link_ops_mutex); >> + list_del(&ops->list); >> + mutex_unlock(&link_ops_mutex); >> +} >> +EXPORT_SYMBOL(rdma_link_unregister); >> + >> +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, >> + struct netlink_ext_ack *extack) >> +{ >> + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; >> + char ibdev_name[IB_DEVICE_NAME_MAX]; >> + const struct rdma_link_ops *ops; >> + char ndev_name[IFNAMSIZ]; >> + char type[IFNAMSIZ]; >> + int err; >> + >> + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, >> + nldev_policy, extack); >> + if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || >> + !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || >> !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) >> + return -EINVAL; >> + >> + if (nla_len(tb[RDMA_NLDEV_ATTR_DEV_NAME]) > sizeof(ibdev_name) || >> + nla_len(tb[RDMA_NLDEV_ATTR_LINK_TYPE]) > sizeof(type) || >> + nla_len(tb[RDMA_NLDEV_ATTR_NDEV_NAME]) > sizeof(ndev_name)) >> + return -EINVAL; >> + >> + nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], >> + sizeof(ibdev_name)); >> + if (strchr(ibdev_name, '%')) >> + return -EINVAL; >> + >> + nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); >> + nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], >> + sizeof(ndev_name)); >> + >> + mutex_lock(&link_ops_mutex); >> + ops = link_ops_get(type); >> +#ifdef CONFIG_MODULES >> + if (!ops) { >> + mutex_unlock(&link_ops_mutex); >> + request_module("rdma-link-%s", type); >> + mutex_lock(&link_ops_mutex); >> + ops = link_ops_get(type); >> + } >> +#endif >> + if (ops) >> + err = ops->newlink(ibdev_name, ndev_name); >> + else >> + err = -ENODEV; >> + mutex_unlock(&link_ops_mutex); > Hi Steve, > > Should the mutex be held while calling the callback? If the newlink or dellink > callbacks need to sleep (probably dellink specifically), I think holding the mutex > might not be a good idea. > > Mike It is required to ensure the link_ops don't get unregistered while in use. See our previous discussion: https://www.spinics.net/lists/linux-rdma/msg70989.html Thanks, Steve.
On 11/29/2018 10:54 PM, Jason Gunthorpe wrote: > On Fri, Nov 30, 2018 at 01:29:22AM +0000, Ruhl, Michael J wrote: > >> Should the mutex be held while calling the callback? If the newlink or dellink >> callbacks need to sleep (probably dellink specifically), I think holding the mutex >> might not be a good idea. > It would be a bit better if this lock was a rwsem, but it must be held > whenever ops is touched.. This is probably a moot point though when > dellink goes through the ib_dev not the ops list.. > I'm not sure what you mean, but my current code under test keeps the rdma_link_ops pointer cached in the struct ib_device. Then nldev_dellink() looks like this: + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + mutex_lock(&link_ops_mutex); + + /* Deref the device before deleting it. Otherwise we + * deadlock unregistering the device. + */ + ib_device_put(device); + + if (device->link_ops) + err = device->link_ops->dellink(device); + else + err = -ENODEV; + + mutex_unlock(&link_ops_mutex); I still think the mutex is needed... Note I had to call ib_device_put() -before- calling the driver dellink function or we deadlocked doing the device unregister. The above code doesn't really look good. Any thoughts on a better way? Steve.
On Fri, Nov 30, 2018 at 09:40:52AM -0600, Steve Wise wrote: > > > On 11/29/2018 10:54 PM, Jason Gunthorpe wrote: > > On Fri, Nov 30, 2018 at 01:29:22AM +0000, Ruhl, Michael J wrote: > > > >> Should the mutex be held while calling the callback? If the newlink or dellink > >> callbacks need to sleep (probably dellink specifically), I think holding the mutex > >> might not be a good idea. > > It would be a bit better if this lock was a rwsem, but it must be held > > whenever ops is touched.. This is probably a moot point though when > > dellink goes through the ib_dev not the ops list.. > > > I'm not sure what you mean, but my current code under test keeps the > rdma_link_ops pointer cached in the struct ib_device. Then > nldev_dellink() looks like this: > > + device = ib_device_get_by_index(index); > + if (!device) > + return -EINVAL; > + > + mutex_lock(&link_ops_mutex); > + /* Deref the device before deleting it. Otherwise we > + * deadlock unregistering the device. > + */ > + ib_device_put(device); > + > + if (device->link_ops) > + err = device->link_ops->dellink(device); The driver must directly guarantee that the lifetime of everything under device, including link_ops, exists until ib_unregister_device() returns. For something like link_ops this happens without any extra effort. So the lock is not needed to keep link_ops alive. > I still think the mutex is needed... Note I had to call ib_device_put() > -before- calling the driver dellink function or we deadlocked doing the > device unregister. Ohh... this is all racy in rxe, and rxe has locking bugs :( rxe calls ib_unregister_device from module unload, sysfs/netlink and netdev notifiers and does nothing to guarentee ib_unregsiter_device() is called exactly once. ib_device_put like this is also racy as there is nothing holding the kref in this case, this could get the kref before putting I suppose, but then the core netlink code will try and put an already put device and blamo... So I think you need to add some way to tell netlink that this entire command should use kref not put ? and I don't entirely know how to fix everything wrong with rxe, but broadly - List manipulation of rxe_dev_list must always hold the dev_list_lock, at least rxe_notify doesn't - rxe_remove should do the list_del internally, and it should be approximately: spin_lock(&dev_list_lock); already_removed = list_empty(&rxe->list); list_del_init(&rxe->list); spin_unlock(&dev_list_lock); if (!already_removed) ibv_unregister_device(&rxe->ibdev); - Something needs to prevent module unload until all ib devices are unregistered. If the notifier already has internal locking then it probably makes sense to keep link_ops_mutex in the core code, but as a rwsem Jason
On 11/30/2018 10:27 AM, Jason Gunthorpe wrote: > On Fri, Nov 30, 2018 at 09:40:52AM -0600, Steve Wise wrote: >> >> On 11/29/2018 10:54 PM, Jason Gunthorpe wrote: >>> On Fri, Nov 30, 2018 at 01:29:22AM +0000, Ruhl, Michael J wrote: >>> >>>> Should the mutex be held while calling the callback? If the newlink or dellink >>>> callbacks need to sleep (probably dellink specifically), I think holding the mutex >>>> might not be a good idea. >>> It would be a bit better if this lock was a rwsem, but it must be held >>> whenever ops is touched.. This is probably a moot point though when >>> dellink goes through the ib_dev not the ops list.. >>> >> I'm not sure what you mean, but my current code under test keeps the >> rdma_link_ops pointer cached in the struct ib_device. Then >> nldev_dellink() looks like this: >> >> + device = ib_device_get_by_index(index); >> + if (!device) >> + return -EINVAL; >> + >> + mutex_lock(&link_ops_mutex); >> + /* Deref the device before deleting it. Otherwise we >> + * deadlock unregistering the device. >> + */ >> + ib_device_put(device); >> + >> + if (device->link_ops) >> + err = device->link_ops->dellink(device); > The driver must directly guarantee that the lifetime of everything > under device, including link_ops, exists until ib_unregister_device() > returns. > > For something like link_ops this happens without any extra effort. > > So the lock is not needed to keep link_ops alive. > >> I still think the mutex is needed... Note I had to call ib_device_put() >> -before- calling the driver dellink function or we deadlocked doing the >> device unregister. > Ohh... this is all racy in rxe, and rxe has locking bugs :( > > rxe calls ib_unregister_device from module unload, sysfs/netlink and > netdev notifiers and does nothing to guarentee ib_unregsiter_device() > is called exactly once. > > ib_device_put like this is also racy as there is nothing holding the > kref in this case, this could get the kref before putting I suppose, > but then the core netlink code will try and put an already put device > and blamo... So I think you need to add some way to tell netlink that > this entire command should use kref not put ? > > and I don't entirely know how to fix everything wrong with rxe, but > broadly > - List manipulation of rxe_dev_list must always hold the dev_list_lock, > at least rxe_notify doesn't > - rxe_remove should do the list_del internally, and it should be > approximately: > > spin_lock(&dev_list_lock); > already_removed = list_empty(&rxe->list); > list_del_init(&rxe->list); > spin_unlock(&dev_list_lock); > > if (!already_removed) > ibv_unregister_device(&rxe->ibdev); > > - Something needs to prevent module unload until all ib devices are > unregistered. If the notifier already has internal locking then it > probably makes sense to keep link_ops_mutex in the core code, but as > a rwsem > > Jason So perhaps nldev_dellink() should just lookup the ib_device by index using ib_device_get_by_index(), note the device name (copy it locally) and rdma_link_ops pointer, and call ib_device_put(). Then call the driver's dellink() function, passing the device name. Thus the driver can coordinate the multiple ways the device could be unregistered. An alternative that I discussed with Leon, was having nldev_dellink() just traverse all the registered rdma_link_ops and calling each dellink() function, passing the device name, until one succeeds or they all fail. Simple... I'm going to avoid fixing already-existing RXE bugs as part of this series if I can... Steve.
On Fri, Nov 30, 2018 at 02:07:52PM -0600, Steve Wise wrote: > So perhaps nldev_dellink() should just lookup the ib_device by index > using ib_device_get_by_index(), note the device name (copy it > locally) That just creates races with rename. The API to the driver should be pointer based and has to hold only the kref not the refcnt Jason
On 11/30/2018 2:12 PM, Jason Gunthorpe wrote: > On Fri, Nov 30, 2018 at 02:07:52PM -0600, Steve Wise wrote: > >> So perhaps nldev_dellink() should just lookup the ib_device by index >> using ib_device_get_by_index(), note the device name (copy it >> locally) > That just creates races with rename. > > The API to the driver should be pointer based and has to hold only the > kref not the refcnt > > Jason What kref? I don't see any kref in struct ib_device. /me confused... Steve
On Fri, Nov 30, 2018 at 02:18:00PM -0600, Steve Wise wrote: > > > On 11/30/2018 2:12 PM, Jason Gunthorpe wrote: > > On Fri, Nov 30, 2018 at 02:07:52PM -0600, Steve Wise wrote: > > > >> So perhaps nldev_dellink() should just lookup the ib_device by index > >> using ib_device_get_by_index(), note the device name (copy it > >> locally) > > That just creates races with rename. > > > > The API to the driver should be pointer based and has to hold only the > > kref not the refcnt > > > > Jason > > What kref? I don't see any kref in struct ib_device. /me confused... It is inside the embedded struct device Jason
On 11/30/2018 2:18 PM, Jason Gunthorpe wrote: > On Fri, Nov 30, 2018 at 02:18:00PM -0600, Steve Wise wrote: >> >> On 11/30/2018 2:12 PM, Jason Gunthorpe wrote: >>> On Fri, Nov 30, 2018 at 02:07:52PM -0600, Steve Wise wrote: >>> >>>> So perhaps nldev_dellink() should just lookup the ib_device by index >>>> using ib_device_get_by_index(), note the device name (copy it >>>> locally) >>> That just creates races with rename. >>> >>> The API to the driver should be pointer based and has to hold only the >>> kref not the refcnt >>> >>> Jason >> What kref? I don't see any kref in struct ib_device. /me confused... > It is inside the embedded struct device > > Jason How's this look? + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + ops = device->link_ops; + + /* + * Deref the ib_device before deleting it. Otherwise we + * deadlock unregistering the device. Hold a ref on the + * underlying dma_device though to keep the memory around + * until we're done. + */ + dma_device = get_device(device->dma_device); + ib_device_put(device); + err = ops ? ops->dellink(device) : -ENODEV; + put_device(dma_device); + + return err;
On Fri, Nov 30, 2018 at 02:42:39PM -0600, Steve Wise wrote: > > > On 11/30/2018 2:18 PM, Jason Gunthorpe wrote: > > On Fri, Nov 30, 2018 at 02:18:00PM -0600, Steve Wise wrote: > >> > >> On 11/30/2018 2:12 PM, Jason Gunthorpe wrote: > >>> On Fri, Nov 30, 2018 at 02:07:52PM -0600, Steve Wise wrote: > >>> > >>>> So perhaps nldev_dellink() should just lookup the ib_device by index > >>>> using ib_device_get_by_index(), note the device name (copy it > >>>> locally) > >>> That just creates races with rename. > >>> > >>> The API to the driver should be pointer based and has to hold only the > >>> kref not the refcnt > >>> > >>> Jason > >> What kref? I don't see any kref in struct ib_device. /me confused... > > It is inside the embedded struct device > > > > Jason > > How's this look? > > + device = ib_device_get_by_index(index); > + if (!device) > + return -EINVAL; > + > + ops = device->link_ops; > + > + /* > + * Deref the ib_device before deleting it. Otherwise we > + * deadlock unregistering the device. Hold a ref on the > + * underlying dma_device though to keep the memory around > + * until we're done. > + */ > + dma_device = get_device(device->dma_device); > + ib_device_put(device); > + err = ops ? ops->dellink(device) : -ENODEV; > + put_device(dma_device); This is OK I was thinking about the unload problem, and I wonder if the simple solution is to add something like ib_unregister_fence() which only returns once the device is unregistered and internaly waits for any other parrallel unregisters that may be running. Not sure the locking for that would be any fun though.. Jason
On 11/30/2018 2:52 PM, Jason Gunthorpe wrote: > > This is OK > > I was thinking about the unload problem, and I wonder if the simple > solution is to add something like ib_unregister_fence() which only > returns once the device is unregistered and internaly waits for any > other parrallel unregisters that may be running. Not sure the locking > for that would be any fun though.. > > Jason Can you expand on your thinking here? I'm not following.
On Mon, Dec 03, 2018 at 09:37:00AM -0600, Steve Wise wrote: > > > On 11/30/2018 2:52 PM, Jason Gunthorpe wrote: > > > > This is OK > > > > I was thinking about the unload problem, and I wonder if the simple > > solution is to add something like ib_unregister_fence() which only > > returns once the device is unregistered and internaly waits for any > > other parrallel unregisters that may be running. Not sure the locking > > for that would be any fun though.. > > > > Jason > > Can you expand on your thinking here? I'm not following. I'd have to really dig into it, I think. I'm looking at this and thinking the siw has the same bug classes, so maybe the core code should handle this simply somehow. Probably the list of rxe devices shouldn't even exist, and the core should have some helpers like 'unregsiter all devices of driver RXE' and 'find device of driver RXE' and then manage all this tricky locking internally That would be best, but I'm not really sure the exact details without trying to write it myself.. Jason
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 63cc74483188..04d8af0ccb25 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -33,6 +33,7 @@ #include <linux/module.h> #include <linux/pid.h> #include <linux/pid_namespace.h> +#include <linux/mutex.h> #include <net/netlink.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_netlink.h> @@ -107,6 +108,8 @@ [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -1103,6 +1106,126 @@ static int nldev_res_get_pd_dumpit(struct sk_buff *skb, return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD); } +static LIST_HEAD(link_ops); +static DEFINE_MUTEX(link_ops_mutex); + +static const struct rdma_link_ops *link_ops_get(const char *type) +{ + const struct rdma_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->type, type)) + goto out; + } + ops = NULL; +out: + return ops; +} + +void rdma_link_register(struct rdma_link_ops *ops) +{ + mutex_lock(&link_ops_mutex); + if (link_ops_get(ops->type)) { + WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type); + goto out; + } + list_add(&ops->list, &link_ops); +out: + mutex_unlock(&link_ops_mutex); +} +EXPORT_SYMBOL(rdma_link_register); + +void rdma_link_unregister(struct rdma_link_ops *ops) +{ + mutex_lock(&link_ops_mutex); + list_del(&ops->list); + mutex_unlock(&link_ops_mutex); +} +EXPORT_SYMBOL(rdma_link_unregister); + +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char ibdev_name[IB_DEVICE_NAME_MAX]; + const struct rdma_link_ops *ops; + char ndev_name[IFNAMSIZ]; + char type[IFNAMSIZ]; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || + !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) + return -EINVAL; + + if (nla_len(tb[RDMA_NLDEV_ATTR_DEV_NAME]) > sizeof(ibdev_name) || + nla_len(tb[RDMA_NLDEV_ATTR_LINK_TYPE]) > sizeof(type) || + nla_len(tb[RDMA_NLDEV_ATTR_NDEV_NAME]) > sizeof(ndev_name)) + return -EINVAL; + + nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + sizeof(ibdev_name)); + if (strchr(ibdev_name, '%')) + return -EINVAL; + + nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); + nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], + sizeof(ndev_name)); + + mutex_lock(&link_ops_mutex); + ops = link_ops_get(type); +#ifdef CONFIG_MODULES + if (!ops) { + mutex_unlock(&link_ops_mutex); + request_module("rdma-link-%s", type); + mutex_lock(&link_ops_mutex); + ops = link_ops_get(type); + } +#endif + if (ops) + err = ops->newlink(ibdev_name, ndev_name); + else + err = -ENODEV; + mutex_unlock(&link_ops_mutex); + + return err; +} + +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char ibdev_name[IB_DEVICE_NAME_MAX]; + const struct rdma_link_ops *ops; + char type[IFNAMSIZ]; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || + !tb[RDMA_NLDEV_ATTR_LINK_TYPE]) + return -EINVAL; + + if (nla_len(tb[RDMA_NLDEV_ATTR_DEV_NAME]) > sizeof(ibdev_name) || + nla_len(tb[RDMA_NLDEV_ATTR_LINK_TYPE]) > sizeof(type)) + return -EINVAL; + + nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + sizeof(ibdev_name)); + nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); + + mutex_lock(&link_ops_mutex); + ops = link_ops_get(type); + if (ops) + err = ops->dellink(ibdev_name); + else + err = -ENODEV; + mutex_unlock(&link_ops_mutex); + + return err; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -1112,6 +1235,14 @@ static int nldev_res_get_pd_dumpit(struct sk_buff *skb, .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_NEWLINK] = { + .doit = nldev_newlink, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_DELLINK] = { + .doit = nldev_dellink, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 70218e6b5187..42daca1589bd 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -99,4 +99,16 @@ int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, * Returns true on success or false if no listeners. */ bool rdma_nl_chk_listeners(unsigned int group); + +struct rdma_link_ops { + struct list_head list; + const char *type; + int (*newlink)(const char *ibdev_name, const char *ndev_name); + int (*dellink)(const char *ibdev_name); +}; + +void rdma_link_register(struct rdma_link_ops *ops); +void rdma_link_unregister(struct rdma_link_ops *ops); + +#define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type) #endif /* _RDMA_NETLINK_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index f9c41bf59efc..dfdfc2b608b8 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -229,9 +229,11 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_GET, /* can dump */ RDMA_NLDEV_CMD_SET, - /* 3 - 4 are free to use */ + RDMA_NLDEV_CMD_NEWLINK, - RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */ + RDMA_NLDEV_CMD_DELLINK, + + RDMA_NLDEV_CMD_PORT_GET, /* can dump */ /* 6 - 8 are free to use */ @@ -428,6 +430,11 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */ /* + * Identifies the rdma driver. eg: "rxe" or "siw" + */ + RDMA_NLDEV_ATTR_LINK_TYPE, /* string */ + + /* * Always the end */ RDMA_NLDEV_ATTR_MAX