Message ID | 531889e6a24f7919dec71734c91298d266aa9721.1517418595.git.swise@opengridcomputing.com (mailing list archive) |
---|---|
State | RFC |
Headers | show |
> -----Original Message----- > From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > owner@vger.kernel.org] On Behalf Of Steve Wise > Sent: Tuesday, January 30, 2018 10:59 AM > To: dledford@redhat.com; Jason Gunthorpe <jgg@mellanox.com> > Cc: linux-rdma@vger.kernel.org; leon@kernel.org > Subject: [PATCH RFC 2/2] RDMA/nldev: provide detailed CM_ID information > > Implement RDMA nldev netlink interface to get detailed CM_ID information. > > Because cm_id's are attached to rdma devices in various work queue contexts, > the pid and task information at device-attach time is sometimes not useful. > For example, an nvme/f host connection ends up being bound to a device in a > work queue context and the resulting pid at attach time no longer exists after > connection setup. So instead we mark all cm_id's created via the rdma_ucm as > "user", and all others as "kernel". This required tweaking the restrack code a > little. It also required wrapping some rdma_cm functions to allow passing the > module name string. > > Signed-off-by: Steve Wise <swise@opengridcomputing.com> > --- > drivers/infiniband/core/cma.c | 55 ++++++--- > drivers/infiniband/core/nldev.c | 245 > +++++++++++++++++++++++++++++++++++++ > drivers/infiniband/core/restrack.c | 29 ++++- > drivers/infiniband/core/ucma.c | 8 +- > include/rdma/rdma_cm.h | 24 +++- > include/rdma/restrack.h | 4 + > include/uapi/rdma/rdma_netlink.h | 30 +++++ > 7 files changed, 365 insertions(+), 30 deletions(-) > > diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index > 72ad52b..51fbfa1 100644 > --- a/drivers/infiniband/core/cma.c > +++ b/drivers/infiniband/core/cma.c > @@ -465,6 +465,9 @@ static void _cma_attach_to_dev(struct rdma_id_private > *id_priv, > id_priv->id.route.addr.dev_addr.transport = > rdma_node_get_transport(cma_dev->device->node_type); > list_add_tail(&id_priv->list, &cma_dev->id_list); > + id_priv->id.res.type = RDMA_RESTRACK_CM_ID; > + id_priv->id.res.kern_name = id_priv->id.caller; > + rdma_restrack_add(&id_priv->id.res); > } > > static void cma_attach_to_dev(struct rdma_id_private *id_priv, @@ -737,10 > +740,10 @@ static void cma_deref_id(struct rdma_id_private *id_priv) > complete(&id_priv->comp); > } > > -struct rdma_cm_id *rdma_create_id(struct net *net, > - rdma_cm_event_handler event_handler, > - void *context, enum rdma_port_space ps, > - enum ib_qp_type qp_type) > +struct rdma_cm_id *__rdma_create_id(struct net *net, > + rdma_cm_event_handler event_handler, > + void *context, enum rdma_port_space ps, > + enum ib_qp_type qp_type, const char *caller) > { > struct rdma_id_private *id_priv; > > @@ -748,7 +751,10 @@ struct rdma_cm_id *rdma_create_id(struct net *net, > if (!id_priv) > return ERR_PTR(-ENOMEM); > > - id_priv->owner = task_pid_nr(current); > + if (caller) > + id_priv->id.caller = caller; > + else > + id_priv->owner = task_pid_nr(current); > id_priv->state = RDMA_CM_IDLE; > id_priv->id.context = context; > id_priv->id.event_handler = event_handler; @@ -768,7 +774,7 @@ > struct rdma_cm_id *rdma_create_id(struct net *net, > > return &id_priv->id; > } > -EXPORT_SYMBOL(rdma_create_id); > +EXPORT_SYMBOL(__rdma_create_id); > > static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { > @@ -1628,6 +1634,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) > mutex_unlock(&id_priv->handler_mutex); > > if (id_priv->cma_dev) { > + rdma_restrack_del(&id_priv->id.res); > if (rdma_cap_ib_cm(id_priv->id.device, 1)) { > if (id_priv->cm_id.ib) > ib_destroy_cm_id(id_priv->cm_id.ib); > @@ -1786,9 +1793,10 @@ static struct rdma_id_private > *cma_new_conn_id(struct rdma_cm_id *listen_id, > ib_event->param.req_rcvd.primary_path->service_id; > int ret; > > - id = rdma_create_id(listen_id->route.addr.dev_addr.net, > + id = __rdma_create_id(listen_id->route.addr.dev_addr.net, > listen_id->event_handler, listen_id->context, > - listen_id->ps, ib_event->param.req_rcvd.qp_type); > + listen_id->ps, ib_event->param.req_rcvd.qp_type, > + listen_id->caller); > if (IS_ERR(id)) > return NULL; > > @@ -1843,8 +1851,8 @@ static struct rdma_id_private > *cma_new_udp_id(struct rdma_cm_id *listen_id, > struct net *net = listen_id->route.addr.dev_addr.net; > int ret; > > - id = rdma_create_id(net, listen_id->event_handler, listen_id->context, > - listen_id->ps, IB_QPT_UD); > + id = __rdma_create_id(net, listen_id->event_handler, listen_id- > >context, > + listen_id->ps, IB_QPT_UD, listen_id->caller); > if (IS_ERR(id)) > return NULL; > > @@ -2110,10 +2118,11 @@ static int iw_conn_req_handler(struct iw_cm_id > *cm_id, > goto out; > > /* Create a new RDMA id for the new IW CM ID */ > - new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, > - listen_id->id.event_handler, > - listen_id->id.context, > - RDMA_PS_TCP, IB_QPT_RC); > + new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, > + listen_id->id.event_handler, > + listen_id->id.context, > + RDMA_PS_TCP, IB_QPT_RC, > + listen_id->id.caller); > if (IS_ERR(new_cm_id)) { > ret = -ENOMEM; > goto out; > @@ -2238,8 +2247,8 @@ static void cma_listen_on_dev(struct rdma_id_private > *id_priv, > if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev- > >device, 1)) > return; > > - id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, > - id_priv->id.qp_type); > + id = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, > + id_priv->id.qp_type, id_priv->id.caller); > if (IS_ERR(id)) > return; > > @@ -3347,8 +3356,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct > sockaddr *addr) > > return 0; > err2: > - if (id_priv->cma_dev) > + if (id_priv->cma_dev) { > + rdma_restrack_del(&id_priv->id.res); > cma_release_dev(id_priv); > + } > err1: > cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); > return ret; > @@ -3731,14 +3742,18 @@ static int cma_send_sidr_rep(struct > rdma_id_private *id_priv, > return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } > > -int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param > *conn_param) > +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param > *conn_param, > + const char *caller) > { > struct rdma_id_private *id_priv; > int ret; > > id_priv = container_of(id, struct rdma_id_private, id); > > - id_priv->owner = task_pid_nr(current); > + if (caller) > + id_priv->id.caller = caller; > + else > + id_priv->owner = task_pid_nr(current); > > if (!cma_comp(id_priv, RDMA_CM_CONNECT)) > return -EINVAL; > @@ -3778,7 +3793,7 @@ int rdma_accept(struct rdma_cm_id *id, struct > rdma_conn_param *conn_param) > rdma_reject(id, NULL, 0); > return ret; > } > -EXPORT_SYMBOL(rdma_accept); > +EXPORT_SYMBOL(__rdma_accept); > > int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { diff --git > a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index > fa8655e..a4091b5 100644 > --- a/drivers/infiniband/core/nldev.c > +++ b/drivers/infiniband/core/nldev.c > @@ -34,6 +34,7 @@ > #include <linux/pid.h> > #include <linux/pid_namespace.h> > #include <net/netlink.h> > +#include <rdma/rdma_cm.h> > #include <rdma/rdma_netlink.h> > > #include "core_priv.h" > @@ -71,6 +72,22 @@ > [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, > [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = > NLA_NUL_STRING, > .len = TASK_COMM_LEN }, > + [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = > NLA_NESTED }, > + [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = > NLA_NESTED }, > + [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, > + [RDMA_NLDEV_ATTR_RES_IPV4_SADDR] = { > + .len = FIELD_SIZEOF(struct iphdr, saddr) }, > + [RDMA_NLDEV_ATTR_RES_IPV4_DADDR] = { > + .len = FIELD_SIZEOF(struct iphdr, saddr) }, > + [RDMA_NLDEV_ATTR_RES_IPV6_SADDR] = { > + .len = FIELD_SIZEOF(struct ipv6hdr, saddr) }, > + [RDMA_NLDEV_ATTR_RES_IPV6_DADDR] = { > + .len = FIELD_SIZEOF(struct ipv6hdr, saddr) }, > + [RDMA_NLDEV_ATTR_RES_IP_SPORT] = { .type = NLA_U16 }, > + [RDMA_NLDEV_ATTR_RES_IP_DPORT] = { .type = NLA_U16 }, > + [RDMA_NLDEV_ATTR_RES_DEV_TYPE] = { .type = NLA_U8 }, > + [RDMA_NLDEV_ATTR_RES_TRANSPORT_TYPE] = { .type = NLA_U8 }, > + [RDMA_NLDEV_ATTR_RES_NETWORK_TYPE] = { .type = NLA_U8 }, > }; > > static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) @@ - > 182,6 +199,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device > *device) > [RDMA_RESTRACK_PD] = "pd", > [RDMA_RESTRACK_CQ] = "cq", > [RDMA_RESTRACK_QP] = "qp", > + [RDMA_RESTRACK_CM_ID] = "cm_id", May be rdmacm_id a better name to avoid confusion with ib_cm/iw_cm ids? -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > 182,6 +199,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device > > *device) > > [RDMA_RESTRACK_PD] = "pd", > > [RDMA_RESTRACK_CQ] = "cq", > > [RDMA_RESTRACK_QP] = "qp", > > + [RDMA_RESTRACK_CM_ID] = "cm_id", > > May be rdmacm_id a better name to avoid confusion with ib_cm/iw_cm ids? Eventually, I could see the ib_cm and/or iw_cm info also getting dumped as part of this. So perhaps the name "cm_id" is ok? I prefer it also because its shorter (lazy typer! :) ) -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> -----Original Message----- > From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > owner@vger.kernel.org] On Behalf Of Steve Wise > Sent: Wednesday, January 31, 2018 2:56 PM > To: Parav Pandit <parav@mellanox.com>; dledford@redhat.com; Jason > Gunthorpe <jgg@mellanox.com> > Cc: linux-rdma@vger.kernel.org; leon@kernel.org > Subject: RE: [PATCH RFC 2/2] RDMA/nldev: provide detailed CM_ID information > > > > 182,6 +199,7 @@ static int fill_res_info(struct sk_buff *msg, struct > ib_device > > > *device) > > > [RDMA_RESTRACK_PD] = "pd", > > > [RDMA_RESTRACK_CQ] = "cq", > > > [RDMA_RESTRACK_QP] = "qp", > > > + [RDMA_RESTRACK_CM_ID] = "cm_id", > > > > May be rdmacm_id a better name to avoid confusion with ib_cm/iw_cm ids? > > Eventually, I could see the ib_cm and/or iw_cm info also getting dumped as part Yes. But it is still coming through id of rdmacm_id as transport id information. > of this. So perhaps the name "cm_id" is ok? It's just that longer name removes the ambiguity. I am not too particular about this particular naming. > I prefer it also because its shorter (lazy typer! :) ) Sure, shorter names are useful to end user. But I think this is handled well at iproute2 tool level. For example below two commands achieves the same task. #ip address show #ip a s So may be shorter naming at command level is better left at user space tool level. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jan 31, 2018 at 09:18:39PM +0000, Parav Pandit wrote: > > > > -----Original Message----- > > From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > > owner@vger.kernel.org] On Behalf Of Steve Wise > > Sent: Wednesday, January 31, 2018 2:56 PM > > To: Parav Pandit <parav@mellanox.com>; dledford@redhat.com; Jason > > Gunthorpe <jgg@mellanox.com> > > Cc: linux-rdma@vger.kernel.org; leon@kernel.org > > Subject: RE: [PATCH RFC 2/2] RDMA/nldev: provide detailed CM_ID information > > > > > > 182,6 +199,7 @@ static int fill_res_info(struct sk_buff *msg, struct > > ib_device > > > > *device) > > > > [RDMA_RESTRACK_PD] = "pd", > > > > [RDMA_RESTRACK_CQ] = "cq", > > > > [RDMA_RESTRACK_QP] = "qp", > > > > + [RDMA_RESTRACK_CM_ID] = "cm_id", > > > > > > May be rdmacm_id a better name to avoid confusion with ib_cm/iw_cm ids? > > > > Eventually, I could see the ib_cm and/or iw_cm info also getting dumped as part > Yes. But it is still coming through id of rdmacm_id as transport id information. > > of this. So perhaps the name "cm_id" is ok? > It's just that longer name removes the ambiguity. > I am not too particular about this particular naming. > > > I prefer it also because its shorter (lazy typer! :) ) > Sure, shorter names are useful to end user. > But I think this is handled well at iproute2 tool level. > For example below two commands achieves the same task. > #ip address show > #ip a s > So may be shorter naming at command level is better left at user space tool level. The shorter naming is working on commands and not on objects. In rdamtool, you can write "rdma d s" instead of "rdma dev show", but the name of device (mlx5_0) or object (qp) should be full. Regarding the name, I personally think that cm_id is better because it is general and applicable both to ib_cm and iw_cm. The separation between them can be done with introduction of new netlink attribute, e.g. cm_id_type. Thanks > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 30, 2018 at 08:59:11AM -0800, Steve Wise wrote: > Implement RDMA nldev netlink interface to get detailed CM_ID information. > > Because cm_id's are attached to rdma devices in various work queue contexts, > the pid and task information at device-attach time is sometimes not useful. > For example, an nvme/f host connection ends up being bound to a device > in a work queue context and the resulting pid at attach time no longer exists > after connection setup. So instead we mark all cm_id's created via the > rdma_ucm as "user", and all others as "kernel". This required tweaking > the restrack code a little. It also required wrapping some rdma_cm > functions to allow passing the module name string. > > Signed-off-by: Steve Wise <swise@opengridcomputing.com> > --- > drivers/infiniband/core/cma.c | 55 ++++++--- > drivers/infiniband/core/nldev.c | 245 +++++++++++++++++++++++++++++++++++++ > drivers/infiniband/core/restrack.c | 29 ++++- > drivers/infiniband/core/ucma.c | 8 +- > include/rdma/rdma_cm.h | 24 +++- > include/rdma/restrack.h | 4 + > include/uapi/rdma/rdma_netlink.h | 30 +++++ > 7 files changed, 365 insertions(+), 30 deletions(-) > > diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c > index 72ad52b..51fbfa1 100644 > --- a/drivers/infiniband/core/cma.c > +++ b/drivers/infiniband/core/cma.c > @@ -465,6 +465,9 @@ static void _cma_attach_to_dev(struct rdma_id_private *id_priv, > id_priv->id.route.addr.dev_addr.transport = > rdma_node_get_transport(cma_dev->device->node_type); > list_add_tail(&id_priv->list, &cma_dev->id_list); > + id_priv->id.res.type = RDMA_RESTRACK_CM_ID; > + id_priv->id.res.kern_name = id_priv->id.caller; Steve, I don't like it, I worked hard to hide it from the users of restrack, and don't see reason why the same trick as with ib_create_cq/ib_create_pd won't work here. > + rdma_restrack_add(&id_priv->id.res); > } > > static void cma_attach_to_dev(struct rdma_id_private *id_priv, > @@ -737,10 +740,10 @@ static void cma_deref_id(struct rdma_id_private *id_priv) > complete(&id_priv->comp); > } > > -struct rdma_cm_id *rdma_create_id(struct net *net, > - rdma_cm_event_handler event_handler, > - void *context, enum rdma_port_space ps, > - enum ib_qp_type qp_type) > +struct rdma_cm_id *__rdma_create_id(struct net *net, > + rdma_cm_event_handler event_handler, > + void *context, enum rdma_port_space ps, > + enum ib_qp_type qp_type, const char *caller) > { > struct rdma_id_private *id_priv; > > @@ -748,7 +751,10 @@ struct rdma_cm_id *rdma_create_id(struct net *net, > if (!id_priv) > return ERR_PTR(-ENOMEM); > > - id_priv->owner = task_pid_nr(current); > + if (caller) > + id_priv->id.caller = caller; > + else > + id_priv->owner = task_pid_nr(current); See the comment above > id_priv->state = RDMA_CM_IDLE; > id_priv->id.context = context; > id_priv->id.event_handler = event_handler; Not saying that we need to do it now, but it is important to write, most probably we can drop certain initialization from rdma_create_id() adn reuse rdma_restrack_put/_get. > @@ -768,7 +774,7 @@ struct rdma_cm_id *rdma_create_id(struct net *net, > > return &id_priv->id; > } > -EXPORT_SYMBOL(rdma_create_id); > +EXPORT_SYMBOL(__rdma_create_id); > > static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) > { > @@ -1628,6 +1634,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) > mutex_unlock(&id_priv->handler_mutex); > > if (id_priv->cma_dev) { > + rdma_restrack_del(&id_priv->id.res); You should count all created cm_ids and not only binded. > if (rdma_cap_ib_cm(id_priv->id.device, 1)) { > if (id_priv->cm_id.ib) > ib_destroy_cm_id(id_priv->cm_id.ib); > @@ -1786,9 +1793,10 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, > ib_event->param.req_rcvd.primary_path->service_id; > int ret; > > - id = rdma_create_id(listen_id->route.addr.dev_addr.net, > + id = __rdma_create_id(listen_id->route.addr.dev_addr.net, > listen_id->event_handler, listen_id->context, > - listen_id->ps, ib_event->param.req_rcvd.qp_type); > + listen_id->ps, ib_event->param.req_rcvd.qp_type, > + listen_id->caller); I think the cleanest way will be to create some struct and pass pointer to it so you can unfold all relevant data inside of __rdma_create_id(). > if (IS_ERR(id)) > return NULL; > > @@ -1843,8 +1851,8 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, > struct net *net = listen_id->route.addr.dev_addr.net; > int ret; > > - id = rdma_create_id(net, listen_id->event_handler, listen_id->context, > - listen_id->ps, IB_QPT_UD); > + id = __rdma_create_id(net, listen_id->event_handler, listen_id->context, > + listen_id->ps, IB_QPT_UD, listen_id->caller); > if (IS_ERR(id)) > return NULL; > > @@ -2110,10 +2118,11 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, > goto out; > > /* Create a new RDMA id for the new IW CM ID */ > - new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, > - listen_id->id.event_handler, > - listen_id->id.context, > - RDMA_PS_TCP, IB_QPT_RC); > + new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, > + listen_id->id.event_handler, > + listen_id->id.context, > + RDMA_PS_TCP, IB_QPT_RC, > + listen_id->id.caller); > if (IS_ERR(new_cm_id)) { > ret = -ENOMEM; > goto out; > @@ -2238,8 +2247,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, > if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) > return; > > - id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, > - id_priv->id.qp_type); > + id = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, > + id_priv->id.qp_type, id_priv->id.caller); > if (IS_ERR(id)) > return; > > @@ -3347,8 +3356,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) > > return 0; > err2: > - if (id_priv->cma_dev) > + if (id_priv->cma_dev) { > + rdma_restrack_del(&id_priv->id.res); > cma_release_dev(id_priv); > + } > err1: > cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); > return ret; > @@ -3731,14 +3742,18 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, > return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); > } > > -int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) > +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, > + const char *caller) > { > struct rdma_id_private *id_priv; > int ret; > > id_priv = container_of(id, struct rdma_id_private, id); > > - id_priv->owner = task_pid_nr(current); > + if (caller) > + id_priv->id.caller = caller; > + else > + id_priv->owner = task_pid_nr(current); > > if (!cma_comp(id_priv, RDMA_CM_CONNECT)) > return -EINVAL; > @@ -3778,7 +3793,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) > rdma_reject(id, NULL, 0); > return ret; > } > -EXPORT_SYMBOL(rdma_accept); > +EXPORT_SYMBOL(__rdma_accept); > > int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) > { > diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c > index fa8655e..a4091b5 100644 > --- a/drivers/infiniband/core/nldev.c > +++ b/drivers/infiniband/core/nldev.c > @@ -34,6 +34,7 @@ > #include <linux/pid.h> > #include <linux/pid_namespace.h> > #include <net/netlink.h> > +#include <rdma/rdma_cm.h> > #include <rdma/rdma_netlink.h> > > #include "core_priv.h" > @@ -71,6 +72,22 @@ > [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, > [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, > .len = TASK_COMM_LEN }, > + [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, I would like to use this opportunity. There is CM_ID, so users will be able to query nldev directly on this ID (once we implement .doit), but can we found a proper abstraction for other objects (PD, CQ, QP e.t.c.)? I want to have that all resources will have something similar to ifindex/ibindex. > + [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, > + [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, > + [RDMA_NLDEV_ATTR_RES_IPV4_SADDR] = { > + .len = FIELD_SIZEOF(struct iphdr, saddr) }, > + [RDMA_NLDEV_ATTR_RES_IPV4_DADDR] = { > + .len = FIELD_SIZEOF(struct iphdr, saddr) }, > + [RDMA_NLDEV_ATTR_RES_IPV6_SADDR] = { > + .len = FIELD_SIZEOF(struct ipv6hdr, saddr) }, > + [RDMA_NLDEV_ATTR_RES_IPV6_DADDR] = { > + .len = FIELD_SIZEOF(struct ipv6hdr, saddr) }, > + [RDMA_NLDEV_ATTR_RES_IP_SPORT] = { .type = NLA_U16 }, > + [RDMA_NLDEV_ATTR_RES_IP_DPORT] = { .type = NLA_U16 }, > + [RDMA_NLDEV_ATTR_RES_DEV_TYPE] = { .type = NLA_U8 }, > + [RDMA_NLDEV_ATTR_RES_TRANSPORT_TYPE] = { .type = NLA_U8 }, > + [RDMA_NLDEV_ATTR_RES_NETWORK_TYPE] = { .type = NLA_U8 }, > }; > > static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) > @@ -182,6 +199,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) > [RDMA_RESTRACK_PD] = "pd", > [RDMA_RESTRACK_CQ] = "cq", > [RDMA_RESTRACK_QP] = "qp", > + [RDMA_RESTRACK_CM_ID] = "cm_id", > }; > > struct rdma_restrack_root *res = &device->res; > @@ -284,6 +302,99 @@ static int fill_res_qp_entry(struct sk_buff *msg, > return -EMSGSIZE; > } > > +static int fill_res_cm_id_entry(struct sk_buff *msg, > + struct rdma_cm_id *cm_id, uint32_t port) > +{ > + struct rdma_id_private *id_priv; > + struct nlattr *entry_attr; > + > + if (port && port != cm_id->port_num) > + return 0; > + > + id_priv = container_of(cm_id, struct rdma_id_private, id); > + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY); > + if (!entry_attr) > + goto out; > + > + if (cm_id->port_num && > + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) > + goto err; > + > + if (id_priv->qp_num && > + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, id_priv->qp_num)) > + goto err; > + > + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PS, cm_id->ps)) > + goto err; > + > + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, cm_id->qp_type)) > + goto err; > + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, id_priv->state)) > + goto err; > + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_DEV_TYPE, > + id_priv->id.route.addr.dev_addr.dev_type)) > + goto err; > + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TRANSPORT_TYPE, > + id_priv->id.route.addr.dev_addr.transport)) > + goto err; > + > + if (cm_id->route.addr.src_addr.ss_family == AF_INET) { > + struct sockaddr_in *sin; > + > + sin = (struct sockaddr_in *)&cm_id->route.addr.src_addr; > + if (nla_put_in_addr(msg, RDMA_NLDEV_ATTR_RES_IPV4_SADDR, > + sin->sin_addr.s_addr)) > + goto err; > + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_SPORT, > + sin->sin_port)) > + goto err; > + > + sin = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; > + if (nla_put_in_addr(msg, RDMA_NLDEV_ATTR_RES_IPV4_DADDR, > + sin->sin_addr.s_addr)) > + goto err; > + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_DPORT, > + sin->sin_port)) > + goto err; > + } else { > + struct sockaddr_in6 *sin6; > + > + sin6 = (struct sockaddr_in6 *)&cm_id->route.addr.src_addr; > + if (nla_put_in6_addr(msg, RDMA_NLDEV_ATTR_RES_IPV6_SADDR, > + &sin6->sin6_addr)) > + goto err; > + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_SPORT, > + sin6->sin6_port)) > + goto err; > + > + sin6 = (struct sockaddr_in6 *)&cm_id->route.addr.dst_addr; > + if (nla_put_in6_addr(msg, RDMA_NLDEV_ATTR_RES_IPV6_DADDR, > + &sin6->sin6_addr)) > + goto err; > + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_DPORT, > + sin6->sin6_port)) > + goto err; > + } > + > + if (id_priv->id.caller) { > + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, > + id_priv->id.caller)) > + goto err; > + } else { > + /* CMA keeps the owning pid. */ > + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, id_priv->owner)) > + goto err; > + } > + > + nla_nest_end(msg, entry_attr); > + return 0; > + > +err: > + nla_nest_cancel(msg, entry_attr); > +out: > + return -EMSGSIZE; > +} > + > static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, > struct netlink_ext_ack *extack) > { > @@ -686,6 +797,137 @@ static int nldev_res_get_qp_dumpit(struct sk_buff *skb, > return ret; > } > > +static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb, > + struct netlink_callback *cb) > +{ > + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; > + struct rdma_restrack_entry *res; > + int err, ret = 0, idx = 0; > + struct nlattr *table_attr; > + struct ib_device *device; > + int start = cb->args[0]; > + struct rdma_cm_id *cm_id = NULL; > + struct nlmsghdr *nlh; > + u32 index, port = 0; > + > + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, > + nldev_policy, NULL); > + /* > + * Right now, we are expecting the device index to get QP information, > + * but it is possible to extend this code to return all devices in > + * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX. > + * if it doesn't exist, we will iterate over all devices. > + * > + * But it is not needed for now. > + */ > + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) > + return -EINVAL; > + > + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); > + device = ib_device_get_by_index(index); > + if (!device) > + return -EINVAL; > + > + /* > + * If no PORT_INDEX is supplied, we will return all QPs from that device QPs->CM_IDs > + */ > + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { > + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); > + if (!rdma_is_port_valid(device, port)) { > + ret = -EINVAL; > + goto err_index; > + } > + } > + > + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, > + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_QP_GET), RDMA_NLDEV_CMD_RES_QP_GET -> RDMA_NLDEV_CMD_RES_CM_ID_GET > + 0, NLM_F_MULTI); > + > + if (fill_nldev_handle(skb, device)) { > + ret = -EMSGSIZE; > + goto err; > + } > + > + table_attr = nla_nest_start(skb, RDMA_NLDEV_ATTR_RES_CM_ID); > + if (!table_attr) { > + ret = -EMSGSIZE; > + goto err; > + } > + > + down_read(&device->res.rwsem); > + hash_for_each_possible(device->res.hash, res, node, > + RDMA_RESTRACK_CM_ID) { > + if (idx < start) > + goto next; > + > + if ((rdma_is_kernel_res(res) && > + task_active_pid_ns(current) != &init_pid_ns) || > + (!rdma_is_kernel_res(res) && > + task_active_pid_ns(current) != > + task_active_pid_ns(res->task))) > + /* > + * 1. Kernel QPs should be visible in init namsapce only > + * 2. Preent only QPs visible in the current namespace > + */ > + goto next; > + > + if (!rdma_restrack_get(res)) > + /* > + * Resource is under release now, but we are not > + * relesing lock now, so it will be released in > + * our next pass, once we will get ->next pointer. > + */ > + goto next; > + > + cm_id = container_of(res, struct rdma_cm_id, res); > + > + up_read(&device->res.rwsem); > + ret = fill_res_cm_id_entry(skb, cm_id, port); > + down_read(&device->res.rwsem); > + /* > + * Return resource back, but it won't be released till > + * the &device->res.rwsem will be released for write. > + */ > + rdma_restrack_put(res); > + > + if (ret == -EMSGSIZE) > + /* > + * There is a chance to optimize here. > + * It can be done by using list_prepare_entry > + * and list_for_each_entry_continue afterwards. > + */ > + break; > + if (ret) > + goto res_err; > +next: idx++; > + } > + up_read(&device->res.rwsem); > + > + nla_nest_end(skb, table_attr); > + nlmsg_end(skb, nlh); > + cb->args[0] = idx; > + > + /* > + * No more CM_IDs to fill, cancel the message and > + * return 0 to mark end of dumpit. > + */ > + if (!cm_id) > + goto err; > + > + put_device(&device->dev); > + return skb->len; > + > +res_err: > + nla_nest_cancel(skb, table_attr); > + up_read(&device->res.rwsem); > + > +err: > + nlmsg_cancel(skb, nlh); > + > +err_index: > + put_device(&device->dev); > + return ret; > +} > static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { > [RDMA_NLDEV_CMD_GET] = { > .doit = nldev_get_doit, > @@ -712,6 +954,9 @@ static int nldev_res_get_qp_dumpit(struct sk_buff *skb, > * too. > */ > }, > + [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { > + .dump = nldev_res_get_cm_id_dumpit, > + }, > }; > > void __init nldev_init(void) > diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c > index 857637b..bb13169 100644 > --- a/drivers/infiniband/core/restrack.c > +++ b/drivers/infiniband/core/restrack.c > @@ -3,6 +3,7 @@ > * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. > */ > > +#include <rdma/rdma_cm.h> > #include <rdma/ib_verbs.h> > #include <rdma/restrack.h> > #include <linux/mutex.h> > @@ -67,6 +68,7 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) > struct ib_pd *pd; > struct ib_cq *cq; > struct ib_qp *qp; > + struct rdma_cm_id *cm_id; > > switch (type) { > case RDMA_RESTRACK_PD: > @@ -85,6 +87,10 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) > xrcd = container_of(res, struct ib_xrcd, res); > dev = xrcd->device; > break; > + case RDMA_RESTRACK_CM_ID: > + cm_id = container_of(res, struct rdma_cm_id, res); > + dev = cm_id->device; > + break; > default: > WARN_ONCE(true, "Wrong resource tracking type %u\n", type); > return NULL; > @@ -93,6 +99,27 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) > return dev; > } > > +static bool res_is_user(struct rdma_restrack_entry *res) > +{ > + enum rdma_restrack_type type = res->type; > + bool is_user; > + > + switch (type) { > + case RDMA_RESTRACK_CM_ID: { > + struct rdma_cm_id *cm_id; > + > + cm_id = container_of(res, struct rdma_cm_id, res); > + is_user = !cm_id->caller; > + break; > + } > + default: > + is_user = !uaccess_kernel(); > + break; > + } > + > + return is_user; > +} > + > void rdma_restrack_add(struct rdma_restrack_entry *res) > { > struct ib_device *dev = res_to_dev(res); > @@ -100,7 +127,7 @@ void rdma_restrack_add(struct rdma_restrack_entry *res) > if (!dev) > return; > > - if (!uaccess_kernel()) { > + if (res_is_user(res)) { > get_task_struct(current); > res->task = current; > res->kern_name = NULL; > diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c > index d67219d..f7f0282 100644 > --- a/drivers/infiniband/core/ucma.c > +++ b/drivers/infiniband/core/ucma.c > @@ -476,8 +476,8 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, > return -ENOMEM; > > ctx->uid = cmd.uid; > - ctx->cm_id = rdma_create_id(current->nsproxy->net_ns, > - ucma_event_handler, ctx, cmd.ps, qp_type); > + ctx->cm_id = __rdma_create_id(current->nsproxy->net_ns, > + ucma_event_handler, ctx, cmd.ps, qp_type, NULL); > if (IS_ERR(ctx->cm_id)) { > ret = PTR_ERR(ctx->cm_id); > goto err1; > @@ -1084,12 +1084,12 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, > if (cmd.conn_param.valid) { > ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); > mutex_lock(&file->mut); > - ret = rdma_accept(ctx->cm_id, &conn_param); > + ret = __rdma_accept(ctx->cm_id, &conn_param, NULL); > if (!ret) > ctx->uid = cmd.uid; > mutex_unlock(&file->mut); > } else > - ret = rdma_accept(ctx->cm_id, NULL); > + ret = __rdma_accept(ctx->cm_id, NULL, NULL); > > ucma_put_ctx(ctx); > return ret; > diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h > index 5984225..73dac9b 100644 > --- a/include/rdma/rdma_cm.h > +++ b/include/rdma/rdma_cm.h > @@ -155,6 +155,12 @@ struct rdma_cm_id { > enum rdma_port_space ps; > enum ib_qp_type qp_type; > u8 port_num; > + const char *caller; > + > + /* > + * Internal to RDMA/core, don't use in the drivers > + */ > + struct rdma_restrack_entry res; > }; > > struct rdma_id_private { > @@ -198,6 +204,11 @@ struct rdma_id_private { > enum ib_gid_type gid_type; > }; > > +struct rdma_cm_id *__rdma_create_id(struct net *net, > + rdma_cm_event_handler event_handler, > + void *context, enum rdma_port_space ps, > + enum ib_qp_type qp_type, const char *caller); > + > /** > * rdma_create_id - Create an RDMA identifier. > * > @@ -210,10 +221,9 @@ struct rdma_id_private { > * > * The id holds a reference on the network namespace until it is destroyed. > */ > -struct rdma_cm_id *rdma_create_id(struct net *net, > - rdma_cm_event_handler event_handler, > - void *context, enum rdma_port_space ps, > - enum ib_qp_type qp_type); > +#define rdma_create_id(net, event_handler, context, ps, qp_type) \ > + __rdma_create_id((net), (event_handler), (context), (ps), (qp_type), \ > + KBUILD_MODNAME) > > /** > * rdma_destroy_id - Destroys an RDMA identifier. > @@ -325,6 +335,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, > */ > int rdma_listen(struct rdma_cm_id *id, int backlog); > > +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, > + const char *caller); > + > /** > * rdma_accept - Called to accept a connection request or response. > * @id: Connection identifier associated with the request. > @@ -340,7 +353,8 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, > * state of the qp associated with the id is modified to error, such that any > * previously posted receive buffers would be flushed. > */ > -int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); > +#define rdma_accept(id, conn_param) \ > + __rdma_accept((id), (conn_param), KBUILD_MODNAME) > > /** > * rdma_notify - Notifies the RDMA CM of an asynchronous event that has > diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h > index c2d8116..a794e0e 100644 > --- a/include/rdma/restrack.h > +++ b/include/rdma/restrack.h > @@ -33,6 +33,10 @@ enum rdma_restrack_type { > */ > RDMA_RESTRACK_XRCD, > /** > + * @RDMA_RESTRACK_CM_ID: Connection Manager ID (CM_ID) > + */ > + RDMA_RESTRACK_CM_ID, > + /** > * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations > */ > RDMA_RESTRACK_MAX > diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h > index 17e59be..4ed21ee 100644 > --- a/include/uapi/rdma/rdma_netlink.h > +++ b/include/uapi/rdma/rdma_netlink.h > @@ -240,6 +240,8 @@ enum rdma_nldev_command { > > RDMA_NLDEV_CMD_RES_QP_GET, /* can dump */ > > + RDMA_NLDEV_CMD_RES_CM_ID_GET, /* can dump */ > + > RDMA_NLDEV_NUM_OPS > }; > > @@ -352,6 +354,34 @@ enum rdma_nldev_attr { > */ > RDMA_NLDEV_ATTR_RES_KERN_NAME, /* string */ > > + RDMA_NLDEV_ATTR_RES_CM_ID, /* nested table */ > + RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, /* nested table */ > + /* > + * rdma_cm_id port space. > + */ > + RDMA_NLDEV_ATTR_RES_PS, /* u32 */ > + /* > + * IP Addresses/port attributes. > + */ > + RDMA_NLDEV_ATTR_RES_IPV4_SADDR, /* u8[4] */ > + RDMA_NLDEV_ATTR_RES_IPV4_DADDR, /* u8[4] */ > + RDMA_NLDEV_ATTR_RES_IPV6_SADDR, /* u8[16] */ > + RDMA_NLDEV_ATTR_RES_IPV6_DADDR, /* u8[16] */ > + RDMA_NLDEV_ATTR_RES_IP_SPORT, /* BE u16 */ > + RDMA_NLDEV_ATTR_RES_IP_DPORT, /* BE u16 */ Can you please document the meaning of S (source) and D (destination) in regards of this netlink output? It is needed to remove ambiguity. > + /* > + * ARPHRD_INFINIBAND, ARPHRD_ETHER, ... > + */ > + RDMA_NLDEV_ATTR_RES_DEV_TYPE, /* u8 */ > + /* > + * enum enum rdma_transport_type (IB, IWARP, ...) > + */ > + RDMA_NLDEV_ATTR_RES_TRANSPORT_TYPE, /* u8 */ > + /* > + * enum rdma_network_type (IB, IPv4, IPv6,...) > + */ > + RDMA_NLDEV_ATTR_RES_NETWORK_TYPE, /* u8 */ > + > RDMA_NLDEV_ATTR_MAX > }; > #endif /* _UAPI_RDMA_NETLINK_H */ > -- > 1.8.3.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > On Tue, Jan 30, 2018 at 08:59:11AM -0800, Steve Wise wrote: > > Implement RDMA nldev netlink interface to get detailed CM_ID information. > > > > Because cm_id's are attached to rdma devices in various work queue contexts, > > the pid and task information at device-attach time is sometimes not useful. > > For example, an nvme/f host connection ends up being bound to a device > > in a work queue context and the resulting pid at attach time no longer exists > > after connection setup. So instead we mark all cm_id's created via the > > rdma_ucm as "user", and all others as "kernel". This required tweaking > > the restrack code a little. It also required wrapping some rdma_cm > > functions to allow passing the module name string. > > > > Signed-off-by: Steve Wise <swise@opengridcomputing.com> > > --- > > drivers/infiniband/core/cma.c | 55 ++++++--- > > drivers/infiniband/core/nldev.c | 245 > +++++++++++++++++++++++++++++++++++++ > > drivers/infiniband/core/restrack.c | 29 ++++- > > drivers/infiniband/core/ucma.c | 8 +- > > include/rdma/rdma_cm.h | 24 +++- > > include/rdma/restrack.h | 4 + > > include/uapi/rdma/rdma_netlink.h | 30 +++++ > > 7 files changed, 365 insertions(+), 30 deletions(-) > > > > diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c > > index 72ad52b..51fbfa1 100644 > > --- a/drivers/infiniband/core/cma.c > > +++ b/drivers/infiniband/core/cma.c > > @@ -465,6 +465,9 @@ static void _cma_attach_to_dev(struct > rdma_id_private *id_priv, > > id_priv->id.route.addr.dev_addr.transport = > > rdma_node_get_transport(cma_dev->device->node_type); > > list_add_tail(&id_priv->list, &cma_dev->id_list); > > + id_priv->id.res.type = RDMA_RESTRACK_CM_ID; > > + id_priv->id.res.kern_name = id_priv->id.caller; > > Steve, I don't like it, I worked hard to hide it from the users of restrack, > and don't see reason why the same trick as with ib_create_cq/ib_create_pd > won't > work here. I am doing the same trick, no? rdma_create_id() is a static inline that passes KBUILD_MODNAME. The issue is that at the time the rdma_cm_id is created, it is not associated with any ib_device. That only happens at cma_attach time. So how can the resource be added if there is no device? > > > + rdma_restrack_add(&id_priv->id.res); > > } > > > > static void cma_attach_to_dev(struct rdma_id_private *id_priv, > > @@ -737,10 +740,10 @@ static void cma_deref_id(struct rdma_id_private > *id_priv) > > complete(&id_priv->comp); > > } > > > > -struct rdma_cm_id *rdma_create_id(struct net *net, > > - rdma_cm_event_handler event_handler, > > - void *context, enum rdma_port_space ps, > > - enum ib_qp_type qp_type) > > +struct rdma_cm_id *__rdma_create_id(struct net *net, > > + rdma_cm_event_handler event_handler, > > + void *context, enum rdma_port_space ps, > > + enum ib_qp_type qp_type, const char *caller) > > { > > struct rdma_id_private *id_priv; > > > > @@ -748,7 +751,10 @@ struct rdma_cm_id *rdma_create_id(struct net *net, > > if (!id_priv) > > return ERR_PTR(-ENOMEM); > > > > - id_priv->owner = task_pid_nr(current); > > + if (caller) > > + id_priv->id.caller = caller; > > + else > > + id_priv->owner = task_pid_nr(current); > > See the comment above There is no ib_device at this point, so caller (and owner) must be saved until the cm_id is bound to a device (or possibly devices for listening ids). > > > id_priv->state = RDMA_CM_IDLE; > > id_priv->id.context = context; > > id_priv->id.event_handler = event_handler; > > Not saying that we need to do it now, but it is important to write, most > probably we can drop certain initialization from rdma_create_id() adn > reuse rdma_restrack_put/_get. > I don't understand this comment. Can you please elaborate? > > @@ -768,7 +774,7 @@ struct rdma_cm_id *rdma_create_id(struct net *net, > > > > return &id_priv->id; > > } > > -EXPORT_SYMBOL(rdma_create_id); > > +EXPORT_SYMBOL(__rdma_create_id); > > > > static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) > > { > > @@ -1628,6 +1634,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) > > mutex_unlock(&id_priv->handler_mutex); > > > > if (id_priv->cma_dev) { > > + rdma_restrack_del(&id_priv->id.res); > > You should count all created cm_ids and not only binded. No ib_device if they aren't bound. > > > if (rdma_cap_ib_cm(id_priv->id.device, 1)) { > > if (id_priv->cm_id.ib) > > ib_destroy_cm_id(id_priv->cm_id.ib); > > @@ -1786,9 +1793,10 @@ static struct rdma_id_private > *cma_new_conn_id(struct rdma_cm_id *listen_id, > > ib_event->param.req_rcvd.primary_path->service_id; > > int ret; > > > > - id = rdma_create_id(listen_id->route.addr.dev_addr.net, > > + id = __rdma_create_id(listen_id->route.addr.dev_addr.net, > > listen_id->event_handler, listen_id->context, > > - listen_id->ps, ib_event->param.req_rcvd.qp_type); > > + listen_id->ps, ib_event->param.req_rcvd.qp_type, > > + listen_id->caller); > > I think the cleanest way will be to create some struct and pass pointer to it so > you can unfold all relevant data inside of __rdma_create_id(). > Why is that cleaner? Marshall up the data into a struct, pass a ptr, unmarshall it all... > > if (IS_ERR(id)) > > return NULL; > > > > @@ -1843,8 +1851,8 @@ static struct rdma_id_private > *cma_new_udp_id(struct rdma_cm_id *listen_id, > > struct net *net = listen_id->route.addr.dev_addr.net; > > int ret; > > > > - id = rdma_create_id(net, listen_id->event_handler, listen_id->context, > > - listen_id->ps, IB_QPT_UD); > > + id = __rdma_create_id(net, listen_id->event_handler, listen_id- > >context, > > + listen_id->ps, IB_QPT_UD, listen_id->caller); > > if (IS_ERR(id)) > > return NULL; > > > > @@ -2110,10 +2118,11 @@ static int iw_conn_req_handler(struct iw_cm_id > *cm_id, > > goto out; > > > > /* Create a new RDMA id for the new IW CM ID */ > > - new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, > > - listen_id->id.event_handler, > > - listen_id->id.context, > > - RDMA_PS_TCP, IB_QPT_RC); > > + new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, > > + listen_id->id.event_handler, > > + listen_id->id.context, > > + RDMA_PS_TCP, IB_QPT_RC, > > + listen_id->id.caller); > > if (IS_ERR(new_cm_id)) { > > ret = -ENOMEM; > > goto out; > > @@ -2238,8 +2247,8 @@ static void cma_listen_on_dev(struct > rdma_id_private *id_priv, > > if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev- > >device, 1)) > > return; > > > > - id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, > > - id_priv->id.qp_type); > > + id = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, > > + id_priv->id.qp_type, id_priv->id.caller); > > if (IS_ERR(id)) > > return; > > > > @@ -3347,8 +3356,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct > sockaddr *addr) > > > > return 0; > > err2: > > - if (id_priv->cma_dev) > > + if (id_priv->cma_dev) { > > + rdma_restrack_del(&id_priv->id.res); > > cma_release_dev(id_priv); > > + } > > err1: > > cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); > > return ret; > > @@ -3731,14 +3742,18 @@ static int cma_send_sidr_rep(struct > rdma_id_private *id_priv, > > return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); > > } > > > > -int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param > *conn_param) > > +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param > *conn_param, > > + const char *caller) > > { > > struct rdma_id_private *id_priv; > > int ret; > > > > id_priv = container_of(id, struct rdma_id_private, id); > > > > - id_priv->owner = task_pid_nr(current); > > + if (caller) > > + id_priv->id.caller = caller; > > + else > > + id_priv->owner = task_pid_nr(current); > > > > if (!cma_comp(id_priv, RDMA_CM_CONNECT)) > > return -EINVAL; > > @@ -3778,7 +3793,7 @@ int rdma_accept(struct rdma_cm_id *id, struct > rdma_conn_param *conn_param) > > rdma_reject(id, NULL, 0); > > return ret; > > } > > -EXPORT_SYMBOL(rdma_accept); > > +EXPORT_SYMBOL(__rdma_accept); > > > > int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) > > { > > diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c > > index fa8655e..a4091b5 100644 > > --- a/drivers/infiniband/core/nldev.c > > +++ b/drivers/infiniband/core/nldev.c > > @@ -34,6 +34,7 @@ > > #include <linux/pid.h> > > #include <linux/pid_namespace.h> > > #include <net/netlink.h> > > +#include <rdma/rdma_cm.h> > > #include <rdma/rdma_netlink.h> > > > > #include "core_priv.h" > > @@ -71,6 +72,22 @@ > > [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, > > [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = > NLA_NUL_STRING, > > .len = TASK_COMM_LEN }, > > + [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = > NLA_NESTED }, > > I would like to use this opportunity. There is CM_ID, so users will be > able to query nldev directly on this ID (once we implement .doit), but > can we found a proper abstraction for other objects (PD, CQ, QP e.t.c.)? > > I want to have that all resources will have something similar to ifindex/ibindex. > Pds, cqs, and qps, all have a device-unique number. So ibindex/restrack_type/object_id should work. But cm_id's don't have that. Similar to a socket I guess. So I'm not sure how to identify cm_ids other than by the ipaddresses/ip ports. > > > + [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = > NLA_NESTED }, > > + [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, > > + [RDMA_NLDEV_ATTR_RES_IPV4_SADDR] = { > > + .len = FIELD_SIZEOF(struct iphdr, saddr) }, > > + [RDMA_NLDEV_ATTR_RES_IPV4_DADDR] = { > > + .len = FIELD_SIZEOF(struct iphdr, saddr) }, > > + [RDMA_NLDEV_ATTR_RES_IPV6_SADDR] = { > > + .len = FIELD_SIZEOF(struct ipv6hdr, saddr) }, > > + [RDMA_NLDEV_ATTR_RES_IPV6_DADDR] = { > > + .len = FIELD_SIZEOF(struct ipv6hdr, saddr) }, > > + [RDMA_NLDEV_ATTR_RES_IP_SPORT] = { .type = NLA_U16 }, > > + [RDMA_NLDEV_ATTR_RES_IP_DPORT] = { .type = NLA_U16 }, > > + [RDMA_NLDEV_ATTR_RES_DEV_TYPE] = { .type = NLA_U8 }, > > + [RDMA_NLDEV_ATTR_RES_TRANSPORT_TYPE] = { .type = NLA_U8 }, > > + [RDMA_NLDEV_ATTR_RES_NETWORK_TYPE] = { .type = NLA_U8 }, > > }; > > > > static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) > > @@ -182,6 +199,7 @@ static int fill_res_info(struct sk_buff *msg, struct > ib_device *device) > > [RDMA_RESTRACK_PD] = "pd", > > [RDMA_RESTRACK_CQ] = "cq", > > [RDMA_RESTRACK_QP] = "qp", > > + [RDMA_RESTRACK_CM_ID] = "cm_id", > > }; > > > > struct rdma_restrack_root *res = &device->res; > > @@ -284,6 +302,99 @@ static int fill_res_qp_entry(struct sk_buff *msg, > > return -EMSGSIZE; > > } > > > > +static int fill_res_cm_id_entry(struct sk_buff *msg, > > + struct rdma_cm_id *cm_id, uint32_t port) > > +{ > > + struct rdma_id_private *id_priv; > > + struct nlattr *entry_attr; > > + > > + if (port && port != cm_id->port_num) > > + return 0; > > + > > + id_priv = container_of(cm_id, struct rdma_id_private, id); > > + entry_attr = nla_nest_start(msg, > RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY); > > + if (!entry_attr) > > + goto out; > > + > > + if (cm_id->port_num && > > + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id- > >port_num)) > > + goto err; > > + > > + if (id_priv->qp_num && > > + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, id_priv- > >qp_num)) > > + goto err; > > + > > + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PS, cm_id->ps)) > > + goto err; > > + > > + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, cm_id->qp_type)) > > + goto err; > > + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, id_priv->state)) > > + goto err; > > + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_DEV_TYPE, > > + id_priv->id.route.addr.dev_addr.dev_type)) > > + goto err; > > + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TRANSPORT_TYPE, > > + id_priv->id.route.addr.dev_addr.transport)) > > + goto err; > > + > > + if (cm_id->route.addr.src_addr.ss_family == AF_INET) { > > + struct sockaddr_in *sin; > > + > > + sin = (struct sockaddr_in *)&cm_id->route.addr.src_addr; > > + if (nla_put_in_addr(msg, > RDMA_NLDEV_ATTR_RES_IPV4_SADDR, > > + sin->sin_addr.s_addr)) > > + goto err; > > + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_SPORT, > > + sin->sin_port)) > > + goto err; > > + > > + sin = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; > > + if (nla_put_in_addr(msg, > RDMA_NLDEV_ATTR_RES_IPV4_DADDR, > > + sin->sin_addr.s_addr)) > > + goto err; > > + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_DPORT, > > + sin->sin_port)) > > + goto err; > > + } else { > > + struct sockaddr_in6 *sin6; > > + > > + sin6 = (struct sockaddr_in6 *)&cm_id->route.addr.src_addr; > > + if (nla_put_in6_addr(msg, > RDMA_NLDEV_ATTR_RES_IPV6_SADDR, > > + &sin6->sin6_addr)) > > + goto err; > > + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_SPORT, > > + sin6->sin6_port)) > > + goto err; > > + > > + sin6 = (struct sockaddr_in6 *)&cm_id->route.addr.dst_addr; > > + if (nla_put_in6_addr(msg, > RDMA_NLDEV_ATTR_RES_IPV6_DADDR, > > + &sin6->sin6_addr)) > > + goto err; > > + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_DPORT, > > + sin6->sin6_port)) > > + goto err; > > + } > > + > > + if (id_priv->id.caller) { > > + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, > > + id_priv->id.caller)) > > + goto err; > > + } else { > > + /* CMA keeps the owning pid. */ > > + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, id_priv- > >owner)) > > + goto err; > > + } > > + > > + nla_nest_end(msg, entry_attr); > > + return 0; > > + > > +err: > > + nla_nest_cancel(msg, entry_attr); > > +out: > > + return -EMSGSIZE; > > +} > > + > > static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, > > struct netlink_ext_ack *extack) > > { > > @@ -686,6 +797,137 @@ static int nldev_res_get_qp_dumpit(struct sk_buff > *skb, > > return ret; > > } > > > > +static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb, > > + struct netlink_callback *cb) > > +{ > > + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; > > + struct rdma_restrack_entry *res; > > + int err, ret = 0, idx = 0; > > + struct nlattr *table_attr; > > + struct ib_device *device; > > + int start = cb->args[0]; > > + struct rdma_cm_id *cm_id = NULL; > > + struct nlmsghdr *nlh; > > + u32 index, port = 0; > > + > > + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, > > + nldev_policy, NULL); > > + /* > > + * Right now, we are expecting the device index to get QP information, > > + * but it is possible to extend this code to return all devices in > > + * one shot by checking the existence of > RDMA_NLDEV_ATTR_DEV_INDEX. > > + * if it doesn't exist, we will iterate over all devices. > > + * > > + * But it is not needed for now. > > + */ > > + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) > > + return -EINVAL; > > + > > + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); > > + device = ib_device_get_by_index(index); > > + if (!device) > > + return -EINVAL; > > + > > + /* > > + * If no PORT_INDEX is supplied, we will return all QPs from that device > > QPs->CM_IDs > Oops! > > + */ > > + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { > > + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); > > + if (!rdma_is_port_valid(device, port)) { > > + ret = -EINVAL; > > + goto err_index; > > + } > > + } > > + > > + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, > > + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, > RDMA_NLDEV_CMD_RES_QP_GET), > > RDMA_NLDEV_CMD_RES_QP_GET -> RDMA_NLDEV_CMD_RES_CM_ID_GET > Oops2! > > + 0, NLM_F_MULTI); > > + > > + if (fill_nldev_handle(skb, device)) { > > + ret = -EMSGSIZE; > > + goto err; > > + } > > + > > + table_attr = nla_nest_start(skb, RDMA_NLDEV_ATTR_RES_CM_ID); > > + if (!table_attr) { > > + ret = -EMSGSIZE; > > + goto err; > > + } > > + > > + down_read(&device->res.rwsem); > > + hash_for_each_possible(device->res.hash, res, node, > > + RDMA_RESTRACK_CM_ID) { > > + if (idx < start) > > + goto next; > > + > > + if ((rdma_is_kernel_res(res) && > > + task_active_pid_ns(current) != &init_pid_ns) || > > + (!rdma_is_kernel_res(res) && > > + task_active_pid_ns(current) != > > + task_active_pid_ns(res->task))) > > + /* > > + * 1. Kernel QPs should be visible in init namsapce only > > + * 2. Preent only QPs visible in the current namespace > > + */ > > + goto next; > > + > > + if (!rdma_restrack_get(res)) > > + /* > > + * Resource is under release now, but we are not > > + * relesing lock now, so it will be released in > > + * our next pass, once we will get ->next pointer. > > + */ > > + goto next; > > + > > + cm_id = container_of(res, struct rdma_cm_id, res); > > + > > + up_read(&device->res.rwsem); > > + ret = fill_res_cm_id_entry(skb, cm_id, port); > > + down_read(&device->res.rwsem); > > + /* > > + * Return resource back, but it won't be released till > > + * the &device->res.rwsem will be released for write. > > + */ > > + rdma_restrack_put(res); > > + > > + if (ret == -EMSGSIZE) > > + /* > > + * There is a chance to optimize here. > > + * It can be done by using list_prepare_entry > > + * and list_for_each_entry_continue afterwards. > > + */ > > + break; > > + if (ret) > > + goto res_err; > > +next: idx++; > > + } > > + up_read(&device->res.rwsem); > > + > > + nla_nest_end(skb, table_attr); > > + nlmsg_end(skb, nlh); > > + cb->args[0] = idx; > > + > > + /* > > + * No more CM_IDs to fill, cancel the message and > > + * return 0 to mark end of dumpit. > > + */ > > + if (!cm_id) > > + goto err; > > + > > + put_device(&device->dev); > > + return skb->len; > > + > > +res_err: > > + nla_nest_cancel(skb, table_attr); > > + up_read(&device->res.rwsem); > > + > > +err: > > + nlmsg_cancel(skb, nlh); > > + > > +err_index: > > + put_device(&device->dev); > > + return ret; > > +} > > static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { > > [RDMA_NLDEV_CMD_GET] = { > > .doit = nldev_get_doit, > > @@ -712,6 +954,9 @@ static int nldev_res_get_qp_dumpit(struct sk_buff > *skb, > > * too. > > */ > > }, > > + [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { > > + .dump = nldev_res_get_cm_id_dumpit, > > + }, > > }; > > > > void __init nldev_init(void) > > diff --git a/drivers/infiniband/core/restrack.c > b/drivers/infiniband/core/restrack.c > > index 857637b..bb13169 100644 > > --- a/drivers/infiniband/core/restrack.c > > +++ b/drivers/infiniband/core/restrack.c > > @@ -3,6 +3,7 @@ > > * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. > > */ > > > > +#include <rdma/rdma_cm.h> > > #include <rdma/ib_verbs.h> > > #include <rdma/restrack.h> > > #include <linux/mutex.h> > > @@ -67,6 +68,7 @@ static struct ib_device *res_to_dev(struct > rdma_restrack_entry *res) > > struct ib_pd *pd; > > struct ib_cq *cq; > > struct ib_qp *qp; > > + struct rdma_cm_id *cm_id; > > > > switch (type) { > > case RDMA_RESTRACK_PD: > > @@ -85,6 +87,10 @@ static struct ib_device *res_to_dev(struct > rdma_restrack_entry *res) > > xrcd = container_of(res, struct ib_xrcd, res); > > dev = xrcd->device; > > break; > > + case RDMA_RESTRACK_CM_ID: > > + cm_id = container_of(res, struct rdma_cm_id, res); > > + dev = cm_id->device; > > + break; > > default: > > WARN_ONCE(true, "Wrong resource tracking type %u\n", type); > > return NULL; > > @@ -93,6 +99,27 @@ static struct ib_device *res_to_dev(struct > rdma_restrack_entry *res) > > return dev; > > } > > > > +static bool res_is_user(struct rdma_restrack_entry *res) > > +{ > > + enum rdma_restrack_type type = res->type; > > + bool is_user; > > + > > + switch (type) { > > + case RDMA_RESTRACK_CM_ID: { > > + struct rdma_cm_id *cm_id; > > + > > + cm_id = container_of(res, struct rdma_cm_id, res); > > + is_user = !cm_id->caller; > > + break; > > + } > > + default: > > + is_user = !uaccess_kernel(); > > + break; > > + } > > + > > + return is_user; > > +} > > + > > void rdma_restrack_add(struct rdma_restrack_entry *res) > > { > > struct ib_device *dev = res_to_dev(res); > > @@ -100,7 +127,7 @@ void rdma_restrack_add(struct rdma_restrack_entry > *res) > > if (!dev) > > return; > > > > - if (!uaccess_kernel()) { > > + if (res_is_user(res)) { > > get_task_struct(current); > > res->task = current; > > res->kern_name = NULL; > > diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c > > index d67219d..f7f0282 100644 > > --- a/drivers/infiniband/core/ucma.c > > +++ b/drivers/infiniband/core/ucma.c > > @@ -476,8 +476,8 @@ static ssize_t ucma_create_id(struct ucma_file *file, > const char __user *inbuf, > > return -ENOMEM; > > > > ctx->uid = cmd.uid; > > - ctx->cm_id = rdma_create_id(current->nsproxy->net_ns, > > - ucma_event_handler, ctx, cmd.ps, qp_type); > > + ctx->cm_id = __rdma_create_id(current->nsproxy->net_ns, > > + ucma_event_handler, ctx, cmd.ps, qp_type, NULL); > > if (IS_ERR(ctx->cm_id)) { > > ret = PTR_ERR(ctx->cm_id); > > goto err1; > > @@ -1084,12 +1084,12 @@ static ssize_t ucma_accept(struct ucma_file *file, > const char __user *inbuf, > > if (cmd.conn_param.valid) { > > ucma_copy_conn_param(ctx->cm_id, &conn_param, > &cmd.conn_param); > > mutex_lock(&file->mut); > > - ret = rdma_accept(ctx->cm_id, &conn_param); > > + ret = __rdma_accept(ctx->cm_id, &conn_param, NULL); > > if (!ret) > > ctx->uid = cmd.uid; > > mutex_unlock(&file->mut); > > } else > > - ret = rdma_accept(ctx->cm_id, NULL); > > + ret = __rdma_accept(ctx->cm_id, NULL, NULL); > > > > ucma_put_ctx(ctx); > > return ret; > > diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h > > index 5984225..73dac9b 100644 > > --- a/include/rdma/rdma_cm.h > > +++ b/include/rdma/rdma_cm.h > > @@ -155,6 +155,12 @@ struct rdma_cm_id { > > enum rdma_port_space ps; > > enum ib_qp_type qp_type; > > u8 port_num; > > + const char *caller; > > + > > + /* > > + * Internal to RDMA/core, don't use in the drivers > > + */ > > + struct rdma_restrack_entry res; > > }; > > > > struct rdma_id_private { > > @@ -198,6 +204,11 @@ struct rdma_id_private { > > enum ib_gid_type gid_type; > > }; > > > > +struct rdma_cm_id *__rdma_create_id(struct net *net, > > + rdma_cm_event_handler event_handler, > > + void *context, enum rdma_port_space ps, > > + enum ib_qp_type qp_type, const char *caller); > > + > > /** > > * rdma_create_id - Create an RDMA identifier. > > * > > @@ -210,10 +221,9 @@ struct rdma_id_private { > > * > > * The id holds a reference on the network namespace until it is destroyed. > > */ > > -struct rdma_cm_id *rdma_create_id(struct net *net, > > - rdma_cm_event_handler event_handler, > > - void *context, enum rdma_port_space ps, > > - enum ib_qp_type qp_type); > > +#define rdma_create_id(net, event_handler, context, ps, qp_type) \ > > + __rdma_create_id((net), (event_handler), (context), (ps), (qp_type), \ > > + KBUILD_MODNAME) > > > > /** > > * rdma_destroy_id - Destroys an RDMA identifier. > > @@ -325,6 +335,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct > ib_qp_attr *qp_attr, > > */ > > int rdma_listen(struct rdma_cm_id *id, int backlog); > > > > +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param > *conn_param, > > + const char *caller); > > + > > /** > > * rdma_accept - Called to accept a connection request or response. > > * @id: Connection identifier associated with the request. > > @@ -340,7 +353,8 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct > ib_qp_attr *qp_attr, > > * state of the qp associated with the id is modified to error, such that any > > * previously posted receive buffers would be flushed. > > */ > > -int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param > *conn_param); > > +#define rdma_accept(id, conn_param) \ > > + __rdma_accept((id), (conn_param), KBUILD_MODNAME) > > > > /** > > * rdma_notify - Notifies the RDMA CM of an asynchronous event that has > > diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h > > index c2d8116..a794e0e 100644 > > --- a/include/rdma/restrack.h > > +++ b/include/rdma/restrack.h > > @@ -33,6 +33,10 @@ enum rdma_restrack_type { > > */ > > RDMA_RESTRACK_XRCD, > > /** > > + * @RDMA_RESTRACK_CM_ID: Connection Manager ID (CM_ID) > > + */ > > + RDMA_RESTRACK_CM_ID, > > + /** > > * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations > > */ > > RDMA_RESTRACK_MAX > > diff --git a/include/uapi/rdma/rdma_netlink.h > b/include/uapi/rdma/rdma_netlink.h > > index 17e59be..4ed21ee 100644 > > --- a/include/uapi/rdma/rdma_netlink.h > > +++ b/include/uapi/rdma/rdma_netlink.h > > @@ -240,6 +240,8 @@ enum rdma_nldev_command { > > > > RDMA_NLDEV_CMD_RES_QP_GET, /* can dump */ > > > > + RDMA_NLDEV_CMD_RES_CM_ID_GET, /* can dump */ > > + > > RDMA_NLDEV_NUM_OPS > > }; > > > > @@ -352,6 +354,34 @@ enum rdma_nldev_attr { > > */ > > RDMA_NLDEV_ATTR_RES_KERN_NAME, /* string */ > > > > + RDMA_NLDEV_ATTR_RES_CM_ID, /* nested table */ > > + RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, /* nested table */ > > + /* > > + * rdma_cm_id port space. > > + */ > > + RDMA_NLDEV_ATTR_RES_PS, /* u32 */ > > + /* > > + * IP Addresses/port attributes. > > + */ > > + RDMA_NLDEV_ATTR_RES_IPV4_SADDR, /* u8[4] */ > > + RDMA_NLDEV_ATTR_RES_IPV4_DADDR, /* u8[4] */ > > + RDMA_NLDEV_ATTR_RES_IPV6_SADDR, /* u8[16] */ > > + RDMA_NLDEV_ATTR_RES_IPV6_DADDR, /* u8[16] */ > > + RDMA_NLDEV_ATTR_RES_IP_SPORT, /* BE u16 */ > > + RDMA_NLDEV_ATTR_RES_IP_DPORT, /* BE u16 */ > > Can you please document the meaning of S (source) and D (destination) > in regards of this netlink output? It is needed to remove ambiguity. Sure. > > > + /* > > + * ARPHRD_INFINIBAND, ARPHRD_ETHER, ... > > + */ > > + RDMA_NLDEV_ATTR_RES_DEV_TYPE, /* u8 */ > > + /* > > + * enum enum rdma_transport_type (IB, IWARP, ...) > > + */ > > + RDMA_NLDEV_ATTR_RES_TRANSPORT_TYPE, /* u8 */ > > + /* > > + * enum rdma_network_type (IB, IPv4, IPv6,...) > > + */ > > + RDMA_NLDEV_ATTR_RES_NETWORK_TYPE, /* u8 */ > > + > > RDMA_NLDEV_ATTR_MAX > > }; > > #endif /* _UAPI_RDMA_NETLINK_H */ > > -- > > 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Feb 01, 2018 at 10:01:09AM +0200, Leon Romanovsky wrote: > Regarding the name, I personally think that cm_id is better because it > is general and applicable both to ib_cm and iw_cm. The separation > between them can be done with introduction of new netlink attribute, e.g. > cm_id_type. In netlink the attributes should be self-describing, or intrinsically related to something mandatory and fundamental about their container (eg AF_ family in rtnl) What information do we actually need from the cm_id in various protocol families? Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Feb 01, 2018 at 10:49:44AM +0200, Leon Romanovsky wrote: > > + RDMA_NLDEV_ATTR_RES_IPV4_SADDR, /* u8[4] */ > > + RDMA_NLDEV_ATTR_RES_IPV4_DADDR, /* u8[4] */ > > + RDMA_NLDEV_ATTR_RES_IPV6_SADDR, /* u8[16] */ > > + RDMA_NLDEV_ATTR_RES_IPV6_DADDR, /* u8[16] */ > > + RDMA_NLDEV_ATTR_RES_IP_SPORT, /* BE u16 */ > > + RDMA_NLDEV_ATTR_RES_IP_DPORT, /* BE u16 */ > > Can you please document the meaning of S (source) and D (destination) > in regards of this netlink output? It is needed to remove ambiguity. And no on BE's in netlink, I think. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > On Thu, Feb 01, 2018 at 10:01:09AM +0200, Leon Romanovsky wrote: > > > Regarding the name, I personally think that cm_id is better because it > > is general and applicable both to ib_cm and iw_cm. The separation > > between them can be done with introduction of new netlink attribute, e.g. > > cm_id_type. > > In netlink the attributes should be self-describing, or intrinsically > related to something mandatory and fundamental about their container > (eg AF_ family in rtnl) > > What information do we actually need from the cm_id in various > protocol families? > Looking at iw_cm_id and iwcm_id_private: tos, mapped, state, flags, refcount. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> -----Original Message----- > From: Jason Gunthorpe [mailto:jgg@ziepe.ca] > Sent: Thursday, February 01, 2018 11:54 AM > To: Leon Romanovsky > Cc: Steve Wise; dledford@redhat.com; linux-rdma@vger.kernel.org > Subject: Re: [PATCH RFC 2/2] RDMA/nldev: provide detailed CM_ID information > > On Thu, Feb 01, 2018 at 10:49:44AM +0200, Leon Romanovsky wrote: > > > > + RDMA_NLDEV_ATTR_RES_IPV4_SADDR, /* u8[4] */ > > > + RDMA_NLDEV_ATTR_RES_IPV4_DADDR, /* u8[4] */ > > > + RDMA_NLDEV_ATTR_RES_IPV6_SADDR, /* u8[16] */ > > > + RDMA_NLDEV_ATTR_RES_IPV6_DADDR, /* u8[16] */ > > > + RDMA_NLDEV_ATTR_RES_IP_SPORT, /* BE u16 */ > > > + RDMA_NLDEV_ATTR_RES_IP_DPORT, /* BE u16 */ > > > > Can you please document the meaning of S (source) and D (destination) > > in regards of this netlink output? It is needed to remove ambiguity. > > And no on BE's in netlink, I think. Sure there are, see nla_put_be32() for example. I'm using nla_put_net16() to store the ports. Perhaps I need to change the above comment to ne16 instead of BE u16? -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Feb 01, 2018 at 12:18:05PM -0600, Steve Wise wrote: > > > > From: Jason Gunthorpe [mailto:jgg@ziepe.ca] > > Sent: Thursday, February 01, 2018 11:54 AM > > To: Leon Romanovsky > > Cc: Steve Wise; dledford@redhat.com; linux-rdma@vger.kernel.org > > Subject: Re: [PATCH RFC 2/2] RDMA/nldev: provide detailed CM_ID > information > > > > On Thu, Feb 01, 2018 at 10:49:44AM +0200, Leon Romanovsky wrote: > > > > > > + RDMA_NLDEV_ATTR_RES_IPV4_SADDR, /* u8[4] */ > > > > + RDMA_NLDEV_ATTR_RES_IPV4_DADDR, /* u8[4] */ > > > > + RDMA_NLDEV_ATTR_RES_IPV6_SADDR, /* u8[16] */ > > > > + RDMA_NLDEV_ATTR_RES_IPV6_DADDR, /* u8[16] */ > > > > + RDMA_NLDEV_ATTR_RES_IP_SPORT, /* BE u16 */ > > > > + RDMA_NLDEV_ATTR_RES_IP_DPORT, /* BE u16 */ > > > > > > Can you please document the meaning of S (source) and D (destination) > > > in regards of this netlink output? It is needed to remove ambiguity. > > > > And no on BE's in netlink, I think. > > > Sure there are, see nla_put_be32() for example. I'm using nla_put_net16() to > store the ports. Perhaps I need to change the above comment to ne16 instead > of BE u16? There is no reason thes ports should be be, so don't make it be. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > > On Thu, Feb 01, 2018 at 10:49:44AM +0200, Leon Romanovsky wrote: > > > > > > > > + RDMA_NLDEV_ATTR_RES_IPV4_SADDR, /* u8[4] */ > > > > > + RDMA_NLDEV_ATTR_RES_IPV4_DADDR, /* u8[4] */ > > > > > + RDMA_NLDEV_ATTR_RES_IPV6_SADDR, /* u8[16] */ > > > > > + RDMA_NLDEV_ATTR_RES_IPV6_DADDR, /* u8[16] */ > > > > > + RDMA_NLDEV_ATTR_RES_IP_SPORT, /* BE u16 */ > > > > > + RDMA_NLDEV_ATTR_RES_IP_DPORT, /* BE u16 */ > > > > > > > > Can you please document the meaning of S (source) and D (destination) > > > > in regards of this netlink output? It is needed to remove ambiguity. > > > > > > And no on BE's in netlink, I think. > > > > > > Sure there are, see nla_put_be32() for example. I'm using nla_put_net16() to > > store the ports. Perhaps I need to change the above comment to ne16 instead > > of BE u16? > > There is no reason thes ports should be be, so don't make it be. Well, they're BE on the wire. That's the reason. And other APIs return them to user space in their NBO, like getsockname(). You would rather the nldev code puts it in HBO before sending it up? Steve. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Feb 01, 2018 at 12:37:31PM -0600, Steve Wise wrote: > > > > On Thu, Feb 01, 2018 at 10:49:44AM +0200, Leon Romanovsky wrote: > > > > > > > > > > + RDMA_NLDEV_ATTR_RES_IPV4_SADDR, /* u8[4] */ > > > > > > + RDMA_NLDEV_ATTR_RES_IPV4_DADDR, /* u8[4] */ > > > > > > + RDMA_NLDEV_ATTR_RES_IPV6_SADDR, /* u8[16] */ > > > > > > + RDMA_NLDEV_ATTR_RES_IPV6_DADDR, /* u8[16] */ > > > > > > + RDMA_NLDEV_ATTR_RES_IP_SPORT, /* BE u16 */ > > > > > > + RDMA_NLDEV_ATTR_RES_IP_DPORT, /* BE u16 */ > > > > > > > > > > Can you please document the meaning of S (source) and D > (destination) > > > > > in regards of this netlink output? It is needed to remove ambiguity. > > > > > > > > And no on BE's in netlink, I think. > > > > > > > > > Sure there are, see nla_put_be32() for example. I'm using > nla_put_net16() to > > > store the ports. Perhaps I need to change the above comment to ne16 > instead > > > of BE u16? > > > > There is no reason thes ports should be be, so don't make it be. > > Well, they're BE on the wire. That's the reason. And other APIs return > them to user space in their NBO, like getsockname(). > > You would rather the nldev code puts it in HBO before sending it up? Yes, we have far too much 'guess the endian' in our uapi, it doesn't make sense in modern HW, the swap costs nothing. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Feb 01, 2018 at 10:07:20AM -0600, Steve Wise wrote: > > > > > On Tue, Jan 30, 2018 at 08:59:11AM -0800, Steve Wise wrote: > > > Implement RDMA nldev netlink interface to get detailed CM_ID > information. > > > > > > Because cm_id's are attached to rdma devices in various work queue > contexts, > > > the pid and task information at device-attach time is sometimes not > useful. > > > For example, an nvme/f host connection ends up being bound to a device > > > in a work queue context and the resulting pid at attach time no longer > exists > > > after connection setup. So instead we mark all cm_id's created via the > > > rdma_ucm as "user", and all others as "kernel". This required tweaking > > > the restrack code a little. It also required wrapping some rdma_cm > > > functions to allow passing the module name string. > > > > > > Signed-off-by: Steve Wise <swise@opengridcomputing.com> > > > --- > > > drivers/infiniband/core/cma.c | 55 ++++++--- > > > drivers/infiniband/core/nldev.c | 245 > > +++++++++++++++++++++++++++++++++++++ > > > drivers/infiniband/core/restrack.c | 29 ++++- > > > drivers/infiniband/core/ucma.c | 8 +- > > > include/rdma/rdma_cm.h | 24 +++- > > > include/rdma/restrack.h | 4 + > > > include/uapi/rdma/rdma_netlink.h | 30 +++++ > > > 7 files changed, 365 insertions(+), 30 deletions(-) > > > > > > diff --git a/drivers/infiniband/core/cma.c > b/drivers/infiniband/core/cma.c > > > index 72ad52b..51fbfa1 100644 > > > --- a/drivers/infiniband/core/cma.c > > > +++ b/drivers/infiniband/core/cma.c > > > @@ -465,6 +465,9 @@ static void _cma_attach_to_dev(struct > > rdma_id_private *id_priv, > > > id_priv->id.route.addr.dev_addr.transport = > > > rdma_node_get_transport(cma_dev->device->node_type); > > > list_add_tail(&id_priv->list, &cma_dev->id_list); > > > + id_priv->id.res.type = RDMA_RESTRACK_CM_ID; > > > + id_priv->id.res.kern_name = id_priv->id.caller; > > > > Steve, I don't like it, I worked hard to hide it from the users of > restrack, > > and don't see reason why the same trick as with ib_create_cq/ib_create_pd > > won't > > work here. > > I am doing the same trick, no? rdma_create_id() is a static inline that > passes KBUILD_MODNAME. The issue is that at the time the rdma_cm_id is > created, it is not associated with any ib_device. That only happens at > cma_attach time. So how can the resource be added if there is no device? > So maybe, we don't need to add resource to the DB at rdma_create_id stage and do it in cma_attach only, and in that stage you will update the kern_name with KBUILD_MODNAME. > > > > > + rdma_restrack_add(&id_priv->id.res); > > > } > > > > > > static void cma_attach_to_dev(struct rdma_id_private *id_priv, > > > @@ -737,10 +740,10 @@ static void cma_deref_id(struct rdma_id_private > > *id_priv) > > > complete(&id_priv->comp); > > > } > > > > > > -struct rdma_cm_id *rdma_create_id(struct net *net, > > > - rdma_cm_event_handler event_handler, > > > - void *context, enum rdma_port_space ps, > > > - enum ib_qp_type qp_type) > > > +struct rdma_cm_id *__rdma_create_id(struct net *net, > > > + rdma_cm_event_handler event_handler, > > > + void *context, enum rdma_port_space ps, > > > + enum ib_qp_type qp_type, const char > *caller) > > > { > > > struct rdma_id_private *id_priv; > > > > > > @@ -748,7 +751,10 @@ struct rdma_cm_id *rdma_create_id(struct net *net, > > > if (!id_priv) > > > return ERR_PTR(-ENOMEM); > > > > > > - id_priv->owner = task_pid_nr(current); > > > + if (caller) > > > + id_priv->id.caller = caller; > > > + else > > > + id_priv->owner = task_pid_nr(current); > > > > See the comment above > > There is no ib_device at this point, so caller (and owner) must be saved > until the cm_id is bound to a device (or possibly devices for listening > ids). Why do we need previous owner? Can it be that rdma_create_id was performed by one process and cma_attach by another? > > > > > > id_priv->state = RDMA_CM_IDLE; > > > id_priv->id.context = context; > > > id_priv->id.event_handler = event_handler; > > > > Not saying that we need to do it now, but it is important to write, most > > probably we can drop certain initialization from rdma_create_id() adn > > reuse rdma_restrack_put/_get. > > > > I don't understand this comment. Can you please elaborate? Most probably, we will be able to drop id_priv->qp_mutex in the future, but let's not discuss it now, it is not related to this series. > > > > @@ -768,7 +774,7 @@ struct rdma_cm_id *rdma_create_id(struct net *net, > > > > > > return &id_priv->id; > > > } > > > -EXPORT_SYMBOL(rdma_create_id); > > > +EXPORT_SYMBOL(__rdma_create_id); > > > > > > static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp > *qp) > > > { > > > @@ -1628,6 +1634,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) > > > mutex_unlock(&id_priv->handler_mutex); > > > > > > if (id_priv->cma_dev) { > > > + rdma_restrack_del(&id_priv->id.res); > > > > You should count all created cm_ids and not only binded. > > No ib_device if they aren't bound. > > > > > > if (rdma_cap_ib_cm(id_priv->id.device, 1)) { > > > if (id_priv->cm_id.ib) > > > ib_destroy_cm_id(id_priv->cm_id.ib); > > > @@ -1786,9 +1793,10 @@ static struct rdma_id_private > > *cma_new_conn_id(struct rdma_cm_id *listen_id, > > > ib_event->param.req_rcvd.primary_path->service_id; > > > int ret; > > > > > > - id = rdma_create_id(listen_id->route.addr.dev_addr.net, > > > + id = __rdma_create_id(listen_id->route.addr.dev_addr.net, > > > listen_id->event_handler, listen_id->context, > > > - listen_id->ps, > ib_event->param.req_rcvd.qp_type); > > > + listen_id->ps, ib_event->param.req_rcvd.qp_type, > > > + listen_id->caller); > > > > I think the cleanest way will be to create some struct and pass pointer to > it so > > you can unfold all relevant data inside of __rdma_create_id(). > > > > Why is that cleaner? Marshall up the data into a struct, pass a ptr, > unmarshall it all... I counted 6 arguments, and for me, it smells like something wrong. > > > > > if (IS_ERR(id)) > > > return NULL; > > > > > > @@ -1843,8 +1851,8 @@ static struct rdma_id_private > > *cma_new_udp_id(struct rdma_cm_id *listen_id, > > > struct net *net = listen_id->route.addr.dev_addr.net; > > > int ret; > > > > > > - id = rdma_create_id(net, listen_id->event_handler, > listen_id->context, > > > - listen_id->ps, IB_QPT_UD); > > > + id = __rdma_create_id(net, listen_id->event_handler, listen_id- > > >context, > > > + listen_id->ps, IB_QPT_UD, listen_id->caller); > > > if (IS_ERR(id)) > > > return NULL; > > > > > > @@ -2110,10 +2118,11 @@ static int iw_conn_req_handler(struct iw_cm_id > > *cm_id, > > > goto out; > > > > > > /* Create a new RDMA id for the new IW CM ID */ > > > - new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, > > > - listen_id->id.event_handler, > > > - listen_id->id.context, > > > - RDMA_PS_TCP, IB_QPT_RC); > > > + new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, > > > + listen_id->id.event_handler, > > > + listen_id->id.context, > > > + RDMA_PS_TCP, IB_QPT_RC, > > > + listen_id->id.caller); > > > if (IS_ERR(new_cm_id)) { > > > ret = -ENOMEM; > > > goto out; > > > @@ -2238,8 +2247,8 @@ static void cma_listen_on_dev(struct > > rdma_id_private *id_priv, > > > if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev- > > >device, 1)) > > > return; > > > > > > - id = rdma_create_id(net, cma_listen_handler, id_priv, > id_priv->id.ps, > > > - id_priv->id.qp_type); > > > + id = __rdma_create_id(net, cma_listen_handler, id_priv, > id_priv->id.ps, > > > + id_priv->id.qp_type, id_priv->id.caller); > > > if (IS_ERR(id)) > > > return; > > > > > > @@ -3347,8 +3356,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct > > sockaddr *addr) > > > > > > return 0; > > > err2: > > > - if (id_priv->cma_dev) > > > + if (id_priv->cma_dev) { > > > + rdma_restrack_del(&id_priv->id.res); > > > cma_release_dev(id_priv); > > > + } > > > err1: > > > cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); > > > return ret; > > > @@ -3731,14 +3742,18 @@ static int cma_send_sidr_rep(struct > > rdma_id_private *id_priv, > > > return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); > > > } > > > > > > -int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param > > *conn_param) > > > +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param > > *conn_param, > > > + const char *caller) > > > { > > > struct rdma_id_private *id_priv; > > > int ret; > > > > > > id_priv = container_of(id, struct rdma_id_private, id); > > > > > > - id_priv->owner = task_pid_nr(current); > > > + if (caller) > > > + id_priv->id.caller = caller; > > > + else > > > + id_priv->owner = task_pid_nr(current); > > > > > > if (!cma_comp(id_priv, RDMA_CM_CONNECT)) > > > return -EINVAL; > > > @@ -3778,7 +3793,7 @@ int rdma_accept(struct rdma_cm_id *id, struct > > rdma_conn_param *conn_param) > > > rdma_reject(id, NULL, 0); > > > return ret; > > > } > > > -EXPORT_SYMBOL(rdma_accept); > > > +EXPORT_SYMBOL(__rdma_accept); > > > > > > int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) > > > { > > > diff --git a/drivers/infiniband/core/nldev.c > b/drivers/infiniband/core/nldev.c > > > index fa8655e..a4091b5 100644 > > > --- a/drivers/infiniband/core/nldev.c > > > +++ b/drivers/infiniband/core/nldev.c > > > @@ -34,6 +34,7 @@ > > > #include <linux/pid.h> > > > #include <linux/pid_namespace.h> > > > #include <net/netlink.h> > > > +#include <rdma/rdma_cm.h> > > > #include <rdma/rdma_netlink.h> > > > > > > #include "core_priv.h" > > > @@ -71,6 +72,22 @@ > > > [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, > > > [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = > > NLA_NUL_STRING, > > > .len = TASK_COMM_LEN }, > > > + [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = > > NLA_NESTED }, > > > > I would like to use this opportunity. There is CM_ID, so users will be > > able to query nldev directly on this ID (once we implement .doit), but > > can we found a proper abstraction for other objects (PD, CQ, QP e.t.c.)? > > > > I want to have that all resources will have something similar to > ifindex/ibindex. > > > > Pds, cqs, and qps, all have a device-unique number. So > ibindex/restrack_type/object_id should work. But cm_id's don't have that. > Similar to a socket I guess. So I'm not sure how to identify cm_ids other > than by the ipaddresses/ip ports. It is opposite, cm_id is a unique number, but other objects don't have such. What about PD and CQ? We can declare that access to QP .doit willbe based on QPN, PD .doit will be based on local_dma_lkey, but what will be CQ identifier? Thanks
Hey Leon, > On Thu, Feb 01, 2018 at 10:07:20AM -0600, Steve Wise wrote: > > > > > > > > On Tue, Jan 30, 2018 at 08:59:11AM -0800, Steve Wise wrote: > > > > Implement RDMA nldev netlink interface to get detailed CM_ID > > information. > > > > > > > > Because cm_id's are attached to rdma devices in various work queue > > contexts, > > > > the pid and task information at device-attach time is sometimes not > > useful. > > > > For example, an nvme/f host connection ends up being bound to a device > > > > in a work queue context and the resulting pid at attach time no longer > > exists > > > > after connection setup. So instead we mark all cm_id's created via the > > > > rdma_ucm as "user", and all others as "kernel". This required tweaking > > > > the restrack code a little. It also required wrapping some rdma_cm > > > > functions to allow passing the module name string. > > > > > > > > Signed-off-by: Steve Wise <swise@opengridcomputing.com> > > > > --- > > > > drivers/infiniband/core/cma.c | 55 ++++++--- > > > > drivers/infiniband/core/nldev.c | 245 > > > +++++++++++++++++++++++++++++++++++++ > > > > drivers/infiniband/core/restrack.c | 29 ++++- > > > > drivers/infiniband/core/ucma.c | 8 +- > > > > include/rdma/rdma_cm.h | 24 +++- > > > > include/rdma/restrack.h | 4 + > > > > include/uapi/rdma/rdma_netlink.h | 30 +++++ > > > > 7 files changed, 365 insertions(+), 30 deletions(-) > > > > > > > > diff --git a/drivers/infiniband/core/cma.c > > b/drivers/infiniband/core/cma.c > > > > index 72ad52b..51fbfa1 100644 > > > > --- a/drivers/infiniband/core/cma.c > > > > +++ b/drivers/infiniband/core/cma.c > > > > @@ -465,6 +465,9 @@ static void _cma_attach_to_dev(struct > > > rdma_id_private *id_priv, > > > > id_priv->id.route.addr.dev_addr.transport = > > > > rdma_node_get_transport(cma_dev->device->node_type); > > > > list_add_tail(&id_priv->list, &cma_dev->id_list); > > > > + id_priv->id.res.type = RDMA_RESTRACK_CM_ID; > > > > + id_priv->id.res.kern_name = id_priv->id.caller; > > > > > > Steve, I don't like it, I worked hard to hide it from the users of > > restrack, > > > and don't see reason why the same trick as with ib_create_cq/ib_create_pd > > > won't > > > work here. > > > > I am doing the same trick, no? rdma_create_id() is a static inline that > > passes KBUILD_MODNAME. The issue is that at the time the rdma_cm_id is > > created, it is not associated with any ib_device. That only happens at > > cma_attach time. So how can the resource be added if there is no device? > > > > So maybe, we don't need to add resource to the DB at rdma_create_id > stage and do it in cma_attach only, and in that stage you will update > the kern_name with KBUILD_MODNAME. Yea, I'll look into this. > > > > > > > > + rdma_restrack_add(&id_priv->id.res); > > > > } > > > > > > > > static void cma_attach_to_dev(struct rdma_id_private *id_priv, > > > > @@ -737,10 +740,10 @@ static void cma_deref_id(struct rdma_id_private > > > *id_priv) > > > > complete(&id_priv->comp); > > > > } > > > > > > > > -struct rdma_cm_id *rdma_create_id(struct net *net, > > > > - rdma_cm_event_handler event_handler, > > > > - void *context, enum rdma_port_space ps, > > > > - enum ib_qp_type qp_type) > > > > +struct rdma_cm_id *__rdma_create_id(struct net *net, > > > > + rdma_cm_event_handler event_handler, > > > > + void *context, enum rdma_port_space ps, > > > > + enum ib_qp_type qp_type, const char > > *caller) > > > > { > > > > struct rdma_id_private *id_priv; > > > > > > > > @@ -748,7 +751,10 @@ struct rdma_cm_id *rdma_create_id(struct net > *net, > > > > if (!id_priv) > > > > return ERR_PTR(-ENOMEM); > > > > > > > > - id_priv->owner = task_pid_nr(current); > > > > + if (caller) > > > > + id_priv->id.caller = caller; > > > > + else > > > > + id_priv->owner = task_pid_nr(current); > > > > > > See the comment above > > > > There is no ib_device at this point, so caller (and owner) must be saved > > until the cm_id is bound to a device (or possibly devices for listening > > ids). > > Why do we need previous owner? Can it be that rdma_create_id was > performed by one process and cma_attach by another? Yes. Connection setup events are processed on rdma_cm (and ib_cm/iw_cm) workqueue threads. Here's one example: The application creates a cm_id, binds to 0.0.0.0/0 and listens. But there are no rdma devices at this point. There is a cm_id owned by the application, but not bound to any device. Then, lets say mlx4 and cxgb4 both get inserted. The rdma_cm will discover the new rdma devices, create and attach child cm_ids to those devices. This is done in a workq thread driven off of the ib_client device_add upcall. So what we really want to show is that these per-device cm_ids are owned by the same application. There are other cases, that I've seen with just nvme/f host. I'll produce more examples to help us understand if there is a better path than what I've proposed in this patch. > > > > > > > > > > id_priv->state = RDMA_CM_IDLE; > > > > id_priv->id.context = context; > > > > id_priv->id.event_handler = event_handler; > > > > > > Not saying that we need to do it now, but it is important to write, most > > > probably we can drop certain initialization from rdma_create_id() adn > > > reuse rdma_restrack_put/_get. > > > > > > > I don't understand this comment. Can you please elaborate? > > Most probably, we will be able to drop id_priv->qp_mutex in the future, > but let's not discuss it now, it is not related to this series. > > > > > > > @@ -768,7 +774,7 @@ struct rdma_cm_id *rdma_create_id(struct net > *net, > > > > > > > > return &id_priv->id; > > > > } > > > > -EXPORT_SYMBOL(rdma_create_id); > > > > +EXPORT_SYMBOL(__rdma_create_id); > > > > > > > > static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp > > *qp) > > > > { > > > > @@ -1628,6 +1634,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) > > > > mutex_unlock(&id_priv->handler_mutex); > > > > > > > > if (id_priv->cma_dev) { > > > > + rdma_restrack_del(&id_priv->id.res); > > > > > > You should count all created cm_ids and not only binded. > > > > No ib_device if they aren't bound. > > > > > > > > > if (rdma_cap_ib_cm(id_priv->id.device, 1)) { > > > > if (id_priv->cm_id.ib) > > > > ib_destroy_cm_id(id_priv->cm_id.ib); > > > > @@ -1786,9 +1793,10 @@ static struct rdma_id_private > > > *cma_new_conn_id(struct rdma_cm_id *listen_id, > > > > ib_event->param.req_rcvd.primary_path->service_id; > > > > int ret; > > > > > > > > - id = rdma_create_id(listen_id->route.addr.dev_addr.net, > > > > + id = __rdma_create_id(listen_id->route.addr.dev_addr.net, > > > > listen_id->event_handler, listen_id->context, > > > > - listen_id->ps, > > ib_event->param.req_rcvd.qp_type); > > > > + listen_id->ps, ib_event->param.req_rcvd.qp_type, > > > > + listen_id->caller); > > > > > > I think the cleanest way will be to create some struct and pass pointer to > > it so > > > you can unfold all relevant data inside of __rdma_create_id(). > > > > > > > Why is that cleaner? Marshall up the data into a struct, pass a ptr, > > unmarshall it all... > > I counted 6 arguments, and for me, it smells like something wrong. > I'll look into changing this. > > > > > > > > if (IS_ERR(id)) > > > > return NULL; > > > > > > > > @@ -1843,8 +1851,8 @@ static struct rdma_id_private > > > *cma_new_udp_id(struct rdma_cm_id *listen_id, > > > > struct net *net = listen_id->route.addr.dev_addr.net; > > > > int ret; > > > > > > > > - id = rdma_create_id(net, listen_id->event_handler, > > listen_id->context, > > > > - listen_id->ps, IB_QPT_UD); > > > > + id = __rdma_create_id(net, listen_id->event_handler, listen_id- > > > >context, > > > > + listen_id->ps, IB_QPT_UD, listen_id->caller); > > > > if (IS_ERR(id)) > > > > return NULL; > > > > > > > > @@ -2110,10 +2118,11 @@ static int iw_conn_req_handler(struct > iw_cm_id > > > *cm_id, > > > > goto out; > > > > > > > > /* Create a new RDMA id for the new IW CM ID */ > > > > - new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, > > > > - listen_id->id.event_handler, > > > > - listen_id->id.context, > > > > - RDMA_PS_TCP, IB_QPT_RC); > > > > + new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, > > > > + listen_id->id.event_handler, > > > > + listen_id->id.context, > > > > + RDMA_PS_TCP, IB_QPT_RC, > > > > + listen_id->id.caller); > > > > if (IS_ERR(new_cm_id)) { > > > > ret = -ENOMEM; > > > > goto out; > > > > @@ -2238,8 +2247,8 @@ static void cma_listen_on_dev(struct > > > rdma_id_private *id_priv, > > > > if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev- > > > >device, 1)) > > > > return; > > > > > > > > - id = rdma_create_id(net, cma_listen_handler, id_priv, > > id_priv->id.ps, > > > > - id_priv->id.qp_type); > > > > + id = __rdma_create_id(net, cma_listen_handler, id_priv, > > id_priv->id.ps, > > > > + id_priv->id.qp_type, id_priv->id.caller); > > > > if (IS_ERR(id)) > > > > return; > > > > > > > > @@ -3347,8 +3356,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, > struct > > > sockaddr *addr) > > > > > > > > return 0; > > > > err2: > > > > - if (id_priv->cma_dev) > > > > + if (id_priv->cma_dev) { > > > > + rdma_restrack_del(&id_priv->id.res); > > > > cma_release_dev(id_priv); > > > > + } > > > > err1: > > > > cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); > > > > return ret; > > > > @@ -3731,14 +3742,18 @@ static int cma_send_sidr_rep(struct > > > rdma_id_private *id_priv, > > > > return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); > > > > } > > > > > > > > -int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param > > > *conn_param) > > > > +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param > > > *conn_param, > > > > + const char *caller) > > > > { > > > > struct rdma_id_private *id_priv; > > > > int ret; > > > > > > > > id_priv = container_of(id, struct rdma_id_private, id); > > > > > > > > - id_priv->owner = task_pid_nr(current); > > > > + if (caller) > > > > + id_priv->id.caller = caller; > > > > + else > > > > + id_priv->owner = task_pid_nr(current); > > > > > > > > if (!cma_comp(id_priv, RDMA_CM_CONNECT)) > > > > return -EINVAL; > > > > @@ -3778,7 +3793,7 @@ int rdma_accept(struct rdma_cm_id *id, struct > > > rdma_conn_param *conn_param) > > > > rdma_reject(id, NULL, 0); > > > > return ret; > > > > } > > > > -EXPORT_SYMBOL(rdma_accept); > > > > +EXPORT_SYMBOL(__rdma_accept); > > > > > > > > int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) > > > > { > > > > diff --git a/drivers/infiniband/core/nldev.c > > b/drivers/infiniband/core/nldev.c > > > > index fa8655e..a4091b5 100644 > > > > --- a/drivers/infiniband/core/nldev.c > > > > +++ b/drivers/infiniband/core/nldev.c > > > > @@ -34,6 +34,7 @@ > > > > #include <linux/pid.h> > > > > #include <linux/pid_namespace.h> > > > > #include <net/netlink.h> > > > > +#include <rdma/rdma_cm.h> > > > > #include <rdma/rdma_netlink.h> > > > > > > > > #include "core_priv.h" > > > > @@ -71,6 +72,22 @@ > > > > [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, > > > > [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = > > > NLA_NUL_STRING, > > > > .len = TASK_COMM_LEN }, > > > > + [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = > > > NLA_NESTED }, > > > > > > I would like to use this opportunity. There is CM_ID, so users will be > > > able to query nldev directly on this ID (once we implement .doit), but > > > can we found a proper abstraction for other objects (PD, CQ, QP e.t.c.)? > > > > > > I want to have that all resources will have something similar to > > ifindex/ibindex. > > > > > > > Pds, cqs, and qps, all have a device-unique number. So > > ibindex/restrack_type/object_id should work. But cm_id's don't have that. > > Similar to a socket I guess. So I'm not sure how to identify cm_ids other > > than by the ipaddresses/ip ports. > > It is opposite, cm_id is a unique number, but other objects don't have > such. What about PD and CQ? > > We can declare that access to QP .doit willbe based on QPN, PD .doit > will be based on local_dma_lkey, but what will be CQ identifier? > Now that I've thought about this more, I realize the core verbs data structures don't have identifiers for most objects. I'm not sure why the qp_num is exposed, but I guess it is because the qp num values must be exchanged between both ends of an IB connection to setup the connection. The local_dma_lkey value will not be unique for each PD. For all iwarp devices the local_dma_lkey is 0. (I think the same is true for IB devices). So PD has the same issue as CQ. I suppose we can use rkey for MRs, but if the MR is not in the VALID state, the rkey isn't necessarily accurate (the LSB of the rkey can be changed by the application with each REG_MR operation). The cm_id doesn't have any unique identifier either, except maybe the combination of src/dst addresses and port space. With iw_cxgb4, CQs, PDs, and MRs all have a unique identifier just like a QP does. I expect all hw implementations of rdma have numeric identifiers for CQs, PDs, QPs, etc. They aren't exposed to the core API though. Perhaps we add a device-unique (or globally unique) identifier as part of the restrack struct and use that for GET/SET? Steve. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Feb 05, 2018 at 09:33:07AM -0600, Steve Wise wrote: > Hey Leon, > > > > > It is opposite, cm_id is a unique number, but other objects don't have > > such. What about PD and CQ? > > > > We can declare that access to QP .doit willbe based on QPN, PD .doit > > will be based on local_dma_lkey, but what will be CQ identifier? > > > > Now that I've thought about this more, I realize the core verbs data > structures don't have identifiers for most objects. I'm not sure why the > qp_num is exposed, but I guess it is because the qp num values must be > exchanged between both ends of an IB connection to setup the connection. > > The local_dma_lkey value will not be unique for each PD. For all iwarp > devices the local_dma_lkey is 0. (I think the same is true for IB devices). > So PD has the same issue as CQ. > > I suppose we can use rkey for MRs, but if the MR is not in the VALID state, > the rkey isn't necessarily accurate (the LSB of the rkey can be changed by > the application with each REG_MR operation). > > The cm_id doesn't have any unique identifier either, except maybe the > combination of src/dst addresses and port space. > > With iw_cxgb4, CQs, PDs, and MRs all have a unique identifier just like a QP > does. I expect all hw implementations of rdma have numeric identifiers for > CQs, PDs, QPs, etc. They aren't exposed to the core API though. > > Perhaps we add a device-unique (or globally unique) identifier as part of > the restrack struct and use that for GET/SET? +1, I like it, it is very clean. Thanks > > Steve. >
> > > > > > It is opposite, cm_id is a unique number, but other objects don't have > > > such. What about PD and CQ? > > > > > > We can declare that access to QP .doit willbe based on QPN, PD .doit > > > will be based on local_dma_lkey, but what will be CQ identifier? > > > > > > > Now that I've thought about this more, I realize the core verbs data > > structures don't have identifiers for most objects. I'm not sure why the > > qp_num is exposed, but I guess it is because the qp num values must be > > exchanged between both ends of an IB connection to setup the connection. > > > > The local_dma_lkey value will not be unique for each PD. For all iwarp > > devices the local_dma_lkey is 0. (I think the same is true for IB devices). > > So PD has the same issue as CQ. > > > > I suppose we can use rkey for MRs, but if the MR is not in the VALID state, > > the rkey isn't necessarily accurate (the LSB of the rkey can be changed by > > the application with each REG_MR operation). > > > > The cm_id doesn't have any unique identifier either, except maybe the > > combination of src/dst addresses and port space. > > > > With iw_cxgb4, CQs, PDs, and MRs all have a unique identifier just like a QP > > does. I expect all hw implementations of rdma have numeric identifiers for > > CQs, PDs, QPs, etc. They aren't exposed to the core API though. > > > > Perhaps we add a device-unique (or globally unique) identifier as part of > > the restrack struct and use that for GET/SET? > > +1, > I like it, it is very clean. > > Thanks This should be done to a subsequent series. Perhaps when GET/SET are actually implemented... Steve. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> Hey Leon, > ... > > > > > diff --git a/drivers/infiniband/core/cma.c > > > b/drivers/infiniband/core/cma.c > > > > > index 72ad52b..51fbfa1 100644 > > > > > --- a/drivers/infiniband/core/cma.c > > > > > +++ b/drivers/infiniband/core/cma.c > > > > > @@ -465,6 +465,9 @@ static void _cma_attach_to_dev(struct > > > > rdma_id_private *id_priv, > > > > > id_priv->id.route.addr.dev_addr.transport = > > > > > rdma_node_get_transport(cma_dev->device->node_type); > > > > > list_add_tail(&id_priv->list, &cma_dev->id_list); > > > > > + id_priv->id.res.type = RDMA_RESTRACK_CM_ID; > > > > > + id_priv->id.res.kern_name = id_priv->id.caller; > > > > > > > > Steve, I don't like it, I worked hard to hide it from the users of > > > restrack, > > > > and don't see reason why the same trick as with > ib_create_cq/ib_create_pd > > > > won't > > > > work here. > > > > > > I am doing the same trick, no? rdma_create_id() is a static inline that > > > passes KBUILD_MODNAME. The issue is that at the time the rdma_cm_id is > > > created, it is not associated with any ib_device. That only happens at > > > cma_attach time. So how can the resource be added if there is no device? > > > > > > > So maybe, we don't need to add resource to the DB at rdma_create_id > > stage and do it in cma_attach only, and in that stage you will update > > the kern_name with KBUILD_MODNAME. > > Yea, I'll look into this. The current patch is only adding the resource in _cma_attach_to_dev(). And using KBUILD_MODNAME in _cma_attach_to_dev() would always end up with "[rdma_cm]". That's why the caller string is saved in rdma_create_id() static inline function; so it will get the true module name. I don't see any change needed here. Thanks, Steve. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > > > > @@ -1786,9 +1793,10 @@ static struct rdma_id_private > > > > *cma_new_conn_id(struct rdma_cm_id *listen_id, > > > > > ib_event->param.req_rcvd.primary_path->service_id; > > > > > int ret; > > > > > > > > > > - id = rdma_create_id(listen_id->route.addr.dev_addr.net, > > > > > + id = __rdma_create_id(listen_id->route.addr.dev_addr.net, > > > > > listen_id->event_handler, listen_id->context, > > > > > - listen_id->ps, > > > ib_event->param.req_rcvd.qp_type); > > > > > + listen_id->ps, ib_event- > >param.req_rcvd.qp_type, > > > > > + listen_id->caller); > > > > > > > > I think the cleanest way will be to create some struct and pass pointer to > > > it so > > > > you can unfold all relevant data inside of __rdma_create_id(). > > > > > > > > > > Why is that cleaner? Marshall up the data into a struct, pass a ptr, > > > unmarshall it all... > > > > I counted 6 arguments, and for me, it smells like something wrong. > > > > I'll look into changing this. > Changing this will force changing all the applications using rdma_create_id(). I'd rather not do that as part of this series. It dilutes the subject of the series. Does anyone else care either way? Steve. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Feb 05, 2018 at 09:33:07AM -0600, Steve Wise wrote: > Perhaps we add a device-unique (or globally unique) identifier as part of > the restrack struct and use that for GET/SET? What use is an ID if the user can't associate that with something meaningful? For userspace, a tuple of the filehandle and existing the per-file handle # would make some sense as a unique ID, but userspace is going to have a pointer not necessarily the handle number :( Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> On Mon, Feb 05, 2018 at 09:33:07AM -0600, Steve Wise wrote: > > > Perhaps we add a device-unique (or globally unique) identifier as part of > > the restrack struct and use that for GET/SET? > > What use is an ID if the user can't associate that with something > meaningful? It uniquely identifies the widget you want to GET or SET. Ideally, having it be something meaningful is nice, but if we want to support GET/SET on various single objects, we need identifiers. And assigning them as part of restrack would do the trick. However, I don't see GET/SET being useful for QP, CQ, PD, CM_ID, nor MR resources. Just DUMP. I do see perhaps devices and links having GET/SET. For instance setting up stuff for RXE and SIW soft devices. > > For userspace, a tuple of the filehandle and existing the per-file > handle # would make some sense as a unique ID, but userspace is going > to have a pointer not necessarily the handle number :( > I'm not following you here. Who does filehandle/handle# pertain to rdmatool fetching objects via netlnk and possibly setting them? Steve. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Feb 05, 2018 at 02:28:02PM -0600, Steve Wise wrote: > > On Mon, Feb 05, 2018 at 09:33:07AM -0600, Steve Wise wrote: > > > > > Perhaps we add a device-unique (or globally unique) identifier as part > of > > > the restrack struct and use that for GET/SET? > > > > What use is an ID if the user can't associate that with something > > meaningful? > > It uniquely identifies the widget you want to GET or SET. Ideally, having > it be something meaningful is nice, but if we want to support GET/SET on > various single objects, we need identifiers. And assigning them as part of > restrack would do the trick. > > However, I don't see GET/SET being useful for QP, CQ, PD, CM_ID, nor MR > resources. Just DUMP. I do see perhaps devices and links having GET/SET. > For instance setting up stuff for RXE and SIW soft devices. Well exactly.. I don't expect GET/SET for restrack objects.. > > For userspace, a tuple of the filehandle and existing the per-file > > handle # would make some sense as a unique ID, but userspace is going > > to have a pointer not necessarily the handle number :( > > I'm not following you here. Who does filehandle/handle# pertain to rdmatool > fetching objects via netlnk and possibly setting them? It is the other side of the question - when rdmatool presents me an object how do I figure out what it is in 'my world' as userspace. Like a PD for instance.. If I have two in a process how do I know which is which compared to a dump? Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > > > For userspace, a tuple of the filehandle and existing the per-file > > > handle # would make some sense as a unique ID, but userspace is going > > > to have a pointer not necessarily the handle number :( > > > > I'm not following you here. Who does filehandle/handle# pertain to rdmatool > > fetching objects via netlnk and possibly setting them? > > It is the other side of the question - when rdmatool presents me an > object how do I figure out what it is in 'my world' as userspace. > > Like a PD for instance.. If I have two in a process how do I know > which is which compared to a dump? You don't. :) As it stands now, you can tell only if a given PD is in your process or not based on the process id. Even if we create a device-unique ID for each restrack object, to make it useful to for userspace analysis would require exposing that ID in the various user object structures (or via query methods). EG having pd->res_id, cq->res_id, etc... Steve. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Feb 05, 2018 at 02:53:23PM -0600, Steve Wise wrote: > Even if we create a device-unique ID for each restrack object, to make it > useful to for userspace analysis would require exposing that ID in the > various user object structures (or via query methods). EG having > pd->res_id, cq->res_id, etc... As I said, we have that, it is the fd, handle # tuple and most verbs objects expose the handle # in the user visible part of the object struct.. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > On Mon, Feb 05, 2018 at 02:53:23PM -0600, Steve Wise wrote: > > > Even if we create a device-unique ID for each restrack object, to make it > > useful to for userspace analysis would require exposing that ID in the > > various user object structures (or via query methods). EG having > > pd->res_id, cq->res_id, etc... > > As I said, we have that, it is the fd, handle # tuple and most verbs > objects expose the handle # in the user visible part of the object > struct.. Can you please show me an example of the handle # for some object like PD or CQ? This doesn't address kernel users' object, though, correct? Thanks, Steve. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Feb 05, 2018 at 04:16:37PM -0600, Steve Wise wrote: > > > > On Mon, Feb 05, 2018 at 02:53:23PM -0600, Steve Wise wrote: > > > > > Even if we create a device-unique ID for each restrack object, to make > it > > > useful to for userspace analysis would require exposing that ID in the > > > various user object structures (or via query methods). EG having > > > pd->res_id, cq->res_id, etc... > > > > As I said, we have that, it is the fd, handle # tuple and most verbs > > objects expose the handle # in the user visible part of the object > > struct.. > > Can you please show me an example of the handle # for some object like PD or > CQ? From rdma-core verbs.h: struct ibv_pd { struct ibv_context *context; uint32_t handle; }; > This doesn't address kernel users' object, though, correct? Right. And id'ing a file descriptor gets kinda hard too.. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > > > As I said, we have that, it is the fd, handle # tuple and most verbs > > objects expose the handle # in the user visible part of the object > > struct.. > > Can you please show me an example of the handle # for some object like PD or > CQ? This doesn't address kernel users' object, though, correct? Never mind, I see: ibv_pd.handle, ibv_mr.handle, etc... -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Feb 05, 2018 at 03:20:25PM -0700, Jason Gunthorpe wrote: > On Mon, Feb 05, 2018 at 04:16:37PM -0600, Steve Wise wrote: > > > > > > On Mon, Feb 05, 2018 at 02:53:23PM -0600, Steve Wise wrote: > > > > > > > Even if we create a device-unique ID for each restrack object, to make > > it > > > > useful to for userspace analysis would require exposing that ID in the > > > > various user object structures (or via query methods). EG having > > > > pd->res_id, cq->res_id, etc... > > > > > > As I said, we have that, it is the fd, handle # tuple and most verbs > > > objects expose the handle # in the user visible part of the object > > > struct.. > > > > Can you please show me an example of the handle # for some object like PD or > > CQ? > > From rdma-core verbs.h: > > struct ibv_pd { > struct ibv_context *context; > uint32_t handle; > }; > > > This doesn't address kernel users' object, though, correct? > > Right. > > And id'ing a file descriptor gets kinda hard too.. And how should we handle kernel objects? > > Jason
On Tue, Feb 06, 2018 at 10:40:19AM +0200, Leon Romanovsky wrote: > On Mon, Feb 05, 2018 at 03:20:25PM -0700, Jason Gunthorpe wrote: > > On Mon, Feb 05, 2018 at 04:16:37PM -0600, Steve Wise wrote: > > > > > > > > On Mon, Feb 05, 2018 at 02:53:23PM -0600, Steve Wise wrote: > > > > > > > > > Even if we create a device-unique ID for each restrack object, to make > > > it > > > > > useful to for userspace analysis would require exposing that ID in the > > > > > various user object structures (or via query methods). EG having > > > > > pd->res_id, cq->res_id, etc... > > > > > > > > As I said, we have that, it is the fd, handle # tuple and most verbs > > > > objects expose the handle # in the user visible part of the object > > > > struct.. > > > > > > Can you please show me an example of the handle # for some object like PD or > > > CQ? > > > > From rdma-core verbs.h: > > > > struct ibv_pd { > > struct ibv_context *context; > > uint32_t handle; > > }; > > > > > This doesn't address kernel users' object, though, correct? > > > > Right. > > > > And id'ing a file descriptor gets kinda hard too.. > > And how should we handle kernel objects? hash'd pointer to internal struct? Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 72ad52b..51fbfa1 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -465,6 +465,9 @@ static void _cma_attach_to_dev(struct rdma_id_private *id_priv, id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); + id_priv->id.res.type = RDMA_RESTRACK_CM_ID; + id_priv->id.res.kern_name = id_priv->id.caller; + rdma_restrack_add(&id_priv->id.res); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, @@ -737,10 +740,10 @@ static void cma_deref_id(struct rdma_id_private *id_priv) complete(&id_priv->comp); } -struct rdma_cm_id *rdma_create_id(struct net *net, - rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps, - enum ib_qp_type qp_type) +struct rdma_cm_id *__rdma_create_id(struct net *net, + rdma_cm_event_handler event_handler, + void *context, enum rdma_port_space ps, + enum ib_qp_type qp_type, const char *caller) { struct rdma_id_private *id_priv; @@ -748,7 +751,10 @@ struct rdma_cm_id *rdma_create_id(struct net *net, if (!id_priv) return ERR_PTR(-ENOMEM); - id_priv->owner = task_pid_nr(current); + if (caller) + id_priv->id.caller = caller; + else + id_priv->owner = task_pid_nr(current); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; @@ -768,7 +774,7 @@ struct rdma_cm_id *rdma_create_id(struct net *net, return &id_priv->id; } -EXPORT_SYMBOL(rdma_create_id); +EXPORT_SYMBOL(__rdma_create_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { @@ -1628,6 +1634,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) mutex_unlock(&id_priv->handler_mutex); if (id_priv->cma_dev) { + rdma_restrack_del(&id_priv->id.res); if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); @@ -1786,9 +1793,10 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, ib_event->param.req_rcvd.primary_path->service_id; int ret; - id = rdma_create_id(listen_id->route.addr.dev_addr.net, + id = __rdma_create_id(listen_id->route.addr.dev_addr.net, listen_id->event_handler, listen_id->context, - listen_id->ps, ib_event->param.req_rcvd.qp_type); + listen_id->ps, ib_event->param.req_rcvd.qp_type, + listen_id->caller); if (IS_ERR(id)) return NULL; @@ -1843,8 +1851,8 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, struct net *net = listen_id->route.addr.dev_addr.net; int ret; - id = rdma_create_id(net, listen_id->event_handler, listen_id->context, - listen_id->ps, IB_QPT_UD); + id = __rdma_create_id(net, listen_id->event_handler, listen_id->context, + listen_id->ps, IB_QPT_UD, listen_id->caller); if (IS_ERR(id)) return NULL; @@ -2110,10 +2118,11 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, goto out; /* Create a new RDMA id for the new IW CM ID */ - new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, - listen_id->id.event_handler, - listen_id->id.context, - RDMA_PS_TCP, IB_QPT_RC); + new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, + listen_id->id.event_handler, + listen_id->id.context, + RDMA_PS_TCP, IB_QPT_RC, + listen_id->id.caller); if (IS_ERR(new_cm_id)) { ret = -ENOMEM; goto out; @@ -2238,8 +2247,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return; - id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, - id_priv->id.qp_type); + id = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, + id_priv->id.qp_type, id_priv->id.caller); if (IS_ERR(id)) return; @@ -3347,8 +3356,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) return 0; err2: - if (id_priv->cma_dev) + if (id_priv->cma_dev) { + rdma_restrack_del(&id_priv->id.res); cma_release_dev(id_priv); + } err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; @@ -3731,14 +3742,18 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } -int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + const char *caller) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); - id_priv->owner = task_pid_nr(current); + if (caller) + id_priv->id.caller = caller; + else + id_priv->owner = task_pid_nr(current); if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; @@ -3778,7 +3793,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) rdma_reject(id, NULL, 0); return ret; } -EXPORT_SYMBOL(rdma_accept); +EXPORT_SYMBOL(__rdma_accept); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index fa8655e..a4091b5 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -34,6 +34,7 @@ #include <linux/pid.h> #include <linux/pid_namespace.h> #include <net/netlink.h> +#include <rdma/rdma_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" @@ -71,6 +72,22 @@ [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, .len = TASK_COMM_LEN }, + [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_IPV4_SADDR] = { + .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [RDMA_NLDEV_ATTR_RES_IPV4_DADDR] = { + .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [RDMA_NLDEV_ATTR_RES_IPV6_SADDR] = { + .len = FIELD_SIZEOF(struct ipv6hdr, saddr) }, + [RDMA_NLDEV_ATTR_RES_IPV6_DADDR] = { + .len = FIELD_SIZEOF(struct ipv6hdr, saddr) }, + [RDMA_NLDEV_ATTR_RES_IP_SPORT] = { .type = NLA_U16 }, + [RDMA_NLDEV_ATTR_RES_IP_DPORT] = { .type = NLA_U16 }, + [RDMA_NLDEV_ATTR_RES_DEV_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_TRANSPORT_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_NETWORK_TYPE] = { .type = NLA_U8 }, }; static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) @@ -182,6 +199,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) [RDMA_RESTRACK_PD] = "pd", [RDMA_RESTRACK_CQ] = "cq", [RDMA_RESTRACK_QP] = "qp", + [RDMA_RESTRACK_CM_ID] = "cm_id", }; struct rdma_restrack_root *res = &device->res; @@ -284,6 +302,99 @@ static int fill_res_qp_entry(struct sk_buff *msg, return -EMSGSIZE; } +static int fill_res_cm_id_entry(struct sk_buff *msg, + struct rdma_cm_id *cm_id, uint32_t port) +{ + struct rdma_id_private *id_priv; + struct nlattr *entry_attr; + + if (port && port != cm_id->port_num) + return 0; + + id_priv = container_of(cm_id, struct rdma_id_private, id); + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY); + if (!entry_attr) + goto out; + + if (cm_id->port_num && + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) + goto err; + + if (id_priv->qp_num && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, id_priv->qp_num)) + goto err; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PS, cm_id->ps)) + goto err; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, cm_id->qp_type)) + goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, id_priv->state)) + goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_DEV_TYPE, + id_priv->id.route.addr.dev_addr.dev_type)) + goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TRANSPORT_TYPE, + id_priv->id.route.addr.dev_addr.transport)) + goto err; + + if (cm_id->route.addr.src_addr.ss_family == AF_INET) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&cm_id->route.addr.src_addr; + if (nla_put_in_addr(msg, RDMA_NLDEV_ATTR_RES_IPV4_SADDR, + sin->sin_addr.s_addr)) + goto err; + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_SPORT, + sin->sin_port)) + goto err; + + sin = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; + if (nla_put_in_addr(msg, RDMA_NLDEV_ATTR_RES_IPV4_DADDR, + sin->sin_addr.s_addr)) + goto err; + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_DPORT, + sin->sin_port)) + goto err; + } else { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&cm_id->route.addr.src_addr; + if (nla_put_in6_addr(msg, RDMA_NLDEV_ATTR_RES_IPV6_SADDR, + &sin6->sin6_addr)) + goto err; + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_SPORT, + sin6->sin6_port)) + goto err; + + sin6 = (struct sockaddr_in6 *)&cm_id->route.addr.dst_addr; + if (nla_put_in6_addr(msg, RDMA_NLDEV_ATTR_RES_IPV6_DADDR, + &sin6->sin6_addr)) + goto err; + if (nla_put_net16(msg, RDMA_NLDEV_ATTR_RES_IP_DPORT, + sin6->sin6_port)) + goto err; + } + + if (id_priv->id.caller) { + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, + id_priv->id.caller)) + goto err; + } else { + /* CMA keeps the owning pid. */ + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, id_priv->owner)) + goto err; + } + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -686,6 +797,137 @@ static int nldev_res_get_qp_dumpit(struct sk_buff *skb, return ret; } +static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct rdma_restrack_entry *res; + int err, ret = 0, idx = 0; + struct nlattr *table_attr; + struct ib_device *device; + int start = cb->args[0]; + struct rdma_cm_id *cm_id = NULL; + struct nlmsghdr *nlh; + u32 index, port = 0; + + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + /* + * Right now, we are expecting the device index to get QP information, + * but it is possible to extend this code to return all devices in + * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX. + * if it doesn't exist, we will iterate over all devices. + * + * But it is not needed for now. + */ + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + /* + * If no PORT_INDEX is supplied, we will return all QPs from that device + */ + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err_index; + } + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_QP_GET), + 0, NLM_F_MULTI); + + if (fill_nldev_handle(skb, device)) { + ret = -EMSGSIZE; + goto err; + } + + table_attr = nla_nest_start(skb, RDMA_NLDEV_ATTR_RES_CM_ID); + if (!table_attr) { + ret = -EMSGSIZE; + goto err; + } + + down_read(&device->res.rwsem); + hash_for_each_possible(device->res.hash, res, node, + RDMA_RESTRACK_CM_ID) { + if (idx < start) + goto next; + + if ((rdma_is_kernel_res(res) && + task_active_pid_ns(current) != &init_pid_ns) || + (!rdma_is_kernel_res(res) && + task_active_pid_ns(current) != + task_active_pid_ns(res->task))) + /* + * 1. Kernel QPs should be visible in init namsapce only + * 2. Preent only QPs visible in the current namespace + */ + goto next; + + if (!rdma_restrack_get(res)) + /* + * Resource is under release now, but we are not + * relesing lock now, so it will be released in + * our next pass, once we will get ->next pointer. + */ + goto next; + + cm_id = container_of(res, struct rdma_cm_id, res); + + up_read(&device->res.rwsem); + ret = fill_res_cm_id_entry(skb, cm_id, port); + down_read(&device->res.rwsem); + /* + * Return resource back, but it won't be released till + * the &device->res.rwsem will be released for write. + */ + rdma_restrack_put(res); + + if (ret == -EMSGSIZE) + /* + * There is a chance to optimize here. + * It can be done by using list_prepare_entry + * and list_for_each_entry_continue afterwards. + */ + break; + if (ret) + goto res_err; +next: idx++; + } + up_read(&device->res.rwsem); + + nla_nest_end(skb, table_attr); + nlmsg_end(skb, nlh); + cb->args[0] = idx; + + /* + * No more CM_IDs to fill, cancel the message and + * return 0 to mark end of dumpit. + */ + if (!cm_id) + goto err; + + put_device(&device->dev); + return skb->len; + +res_err: + nla_nest_cancel(skb, table_attr); + up_read(&device->res.rwsem); + +err: + nlmsg_cancel(skb, nlh); + +err_index: + put_device(&device->dev); + return ret; +} static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -712,6 +954,9 @@ static int nldev_res_get_qp_dumpit(struct sk_buff *skb, * too. */ }, + [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { + .dump = nldev_res_get_cm_id_dumpit, + }, }; void __init nldev_init(void) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 857637b..bb13169 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -3,6 +3,7 @@ * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. */ +#include <rdma/rdma_cm.h> #include <rdma/ib_verbs.h> #include <rdma/restrack.h> #include <linux/mutex.h> @@ -67,6 +68,7 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; + struct rdma_cm_id *cm_id; switch (type) { case RDMA_RESTRACK_PD: @@ -85,6 +87,10 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) xrcd = container_of(res, struct ib_xrcd, res); dev = xrcd->device; break; + case RDMA_RESTRACK_CM_ID: + cm_id = container_of(res, struct rdma_cm_id, res); + dev = cm_id->device; + break; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", type); return NULL; @@ -93,6 +99,27 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) return dev; } +static bool res_is_user(struct rdma_restrack_entry *res) +{ + enum rdma_restrack_type type = res->type; + bool is_user; + + switch (type) { + case RDMA_RESTRACK_CM_ID: { + struct rdma_cm_id *cm_id; + + cm_id = container_of(res, struct rdma_cm_id, res); + is_user = !cm_id->caller; + break; + } + default: + is_user = !uaccess_kernel(); + break; + } + + return is_user; +} + void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); @@ -100,7 +127,7 @@ void rdma_restrack_add(struct rdma_restrack_entry *res) if (!dev) return; - if (!uaccess_kernel()) { + if (res_is_user(res)) { get_task_struct(current); res->task = current; res->kern_name = NULL; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index d67219d..f7f0282 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -476,8 +476,8 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, return -ENOMEM; ctx->uid = cmd.uid; - ctx->cm_id = rdma_create_id(current->nsproxy->net_ns, - ucma_event_handler, ctx, cmd.ps, qp_type); + ctx->cm_id = __rdma_create_id(current->nsproxy->net_ns, + ucma_event_handler, ctx, cmd.ps, qp_type, NULL); if (IS_ERR(ctx->cm_id)) { ret = PTR_ERR(ctx->cm_id); goto err1; @@ -1084,12 +1084,12 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, if (cmd.conn_param.valid) { ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); mutex_lock(&file->mut); - ret = rdma_accept(ctx->cm_id, &conn_param); + ret = __rdma_accept(ctx->cm_id, &conn_param, NULL); if (!ret) ctx->uid = cmd.uid; mutex_unlock(&file->mut); } else - ret = rdma_accept(ctx->cm_id, NULL); + ret = __rdma_accept(ctx->cm_id, NULL, NULL); ucma_put_ctx(ctx); return ret; diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 5984225..73dac9b 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -155,6 +155,12 @@ struct rdma_cm_id { enum rdma_port_space ps; enum ib_qp_type qp_type; u8 port_num; + const char *caller; + + /* + * Internal to RDMA/core, don't use in the drivers + */ + struct rdma_restrack_entry res; }; struct rdma_id_private { @@ -198,6 +204,11 @@ struct rdma_id_private { enum ib_gid_type gid_type; }; +struct rdma_cm_id *__rdma_create_id(struct net *net, + rdma_cm_event_handler event_handler, + void *context, enum rdma_port_space ps, + enum ib_qp_type qp_type, const char *caller); + /** * rdma_create_id - Create an RDMA identifier. * @@ -210,10 +221,9 @@ struct rdma_id_private { * * The id holds a reference on the network namespace until it is destroyed. */ -struct rdma_cm_id *rdma_create_id(struct net *net, - rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps, - enum ib_qp_type qp_type); +#define rdma_create_id(net, event_handler, context, ps, qp_type) \ + __rdma_create_id((net), (event_handler), (context), (ps), (qp_type), \ + KBUILD_MODNAME) /** * rdma_destroy_id - Destroys an RDMA identifier. @@ -325,6 +335,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, */ int rdma_listen(struct rdma_cm_id *id, int backlog); +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + const char *caller); + /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. @@ -340,7 +353,8 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, * state of the qp associated with the id is modified to error, such that any * previously posted receive buffers would be flushed. */ -int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); +#define rdma_accept(id, conn_param) \ + __rdma_accept((id), (conn_param), KBUILD_MODNAME) /** * rdma_notify - Notifies the RDMA CM of an asynchronous event that has diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index c2d8116..a794e0e 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -33,6 +33,10 @@ enum rdma_restrack_type { */ RDMA_RESTRACK_XRCD, /** + * @RDMA_RESTRACK_CM_ID: Connection Manager ID (CM_ID) + */ + RDMA_RESTRACK_CM_ID, + /** * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations */ RDMA_RESTRACK_MAX diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 17e59be..4ed21ee 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -240,6 +240,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_QP_GET, /* can dump */ + RDMA_NLDEV_CMD_RES_CM_ID_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -352,6 +354,34 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_RES_KERN_NAME, /* string */ + RDMA_NLDEV_ATTR_RES_CM_ID, /* nested table */ + RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, /* nested table */ + /* + * rdma_cm_id port space. + */ + RDMA_NLDEV_ATTR_RES_PS, /* u32 */ + /* + * IP Addresses/port attributes. + */ + RDMA_NLDEV_ATTR_RES_IPV4_SADDR, /* u8[4] */ + RDMA_NLDEV_ATTR_RES_IPV4_DADDR, /* u8[4] */ + RDMA_NLDEV_ATTR_RES_IPV6_SADDR, /* u8[16] */ + RDMA_NLDEV_ATTR_RES_IPV6_DADDR, /* u8[16] */ + RDMA_NLDEV_ATTR_RES_IP_SPORT, /* BE u16 */ + RDMA_NLDEV_ATTR_RES_IP_DPORT, /* BE u16 */ + /* + * ARPHRD_INFINIBAND, ARPHRD_ETHER, ... + */ + RDMA_NLDEV_ATTR_RES_DEV_TYPE, /* u8 */ + /* + * enum enum rdma_transport_type (IB, IWARP, ...) + */ + RDMA_NLDEV_ATTR_RES_TRANSPORT_TYPE, /* u8 */ + /* + * enum rdma_network_type (IB, IPv4, IPv6,...) + */ + RDMA_NLDEV_ATTR_RES_NETWORK_TYPE, /* u8 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */
Implement RDMA nldev netlink interface to get detailed CM_ID information. Because cm_id's are attached to rdma devices in various work queue contexts, the pid and task information at device-attach time is sometimes not useful. For example, an nvme/f host connection ends up being bound to a device in a work queue context and the resulting pid at attach time no longer exists after connection setup. So instead we mark all cm_id's created via the rdma_ucm as "user", and all others as "kernel". This required tweaking the restrack code a little. It also required wrapping some rdma_cm functions to allow passing the module name string. Signed-off-by: Steve Wise <swise@opengridcomputing.com> --- drivers/infiniband/core/cma.c | 55 ++++++--- drivers/infiniband/core/nldev.c | 245 +++++++++++++++++++++++++++++++++++++ drivers/infiniband/core/restrack.c | 29 ++++- drivers/infiniband/core/ucma.c | 8 +- include/rdma/rdma_cm.h | 24 +++- include/rdma/restrack.h | 4 + include/uapi/rdma/rdma_netlink.h | 30 +++++ 7 files changed, 365 insertions(+), 30 deletions(-)