
[v4,4/4] IB/sa: Route SA pathrecord query through netlink

Message ID 1433861837-26177-5-git-send-email-kaike.wan@intel.com (mailing list archive)
State Superseded

Commit Message

Wan, Kaike June 9, 2015, 2:57 p.m. UTC
From: Kaike Wan <kaike.wan@intel.com>

This patch routes a SA pathrecord query to netlink first and processes the
response appropriately. If a failure is returned, the request will be sent
through IB. The decision whether to route the request to netlink first is
determined by the presence of a listener for the local service netlink
multicast group. If the user-space local service netlink multicast group
listener is not present, the request will be sent through IB, just like
what is currently being done.

Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: John Fleck <john.fleck@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
---
 drivers/infiniband/core/sa_query.c |  515 +++++++++++++++++++++++++++++++++++-
 include/uapi/rdma/rdma_netlink.h   |    2 +-
 2 files changed, 515 insertions(+), 2 deletions(-)
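For context on the listener check: "presence of a listener for the local
service netlink multicast group" means that some user-space socket (ibacm,
for example) has joined the RDMA_NL_GROUP_LS multicast group on the
NETLINK_RDMA netlink family, which is what ibnl_chk_listeners() tests in
send_mad() below. The following is only a minimal user-space sketch of such
a listener, assuming the uapi definitions added earlier in this series
(RDMA_NL_GROUP_LS in <rdma/rdma_netlink.h>); it joins the group and reads
requests but does not build a reply, which a real service would have to do.

/*
 * Minimal sketch of a local-service listener (not part of this patch).
 * Joining RDMA_NL_GROUP_LS on a NETLINK_RDMA socket is what makes the
 * kernel try netlink first for path record queries.
 */
#include <linux/netlink.h>
#include <rdma/rdma_netlink.h>
#include <sys/socket.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif

int main(void)
{
	struct sockaddr_nl addr;
	unsigned int group = RDMA_NL_GROUP_LS;
	char buf[4096];
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_RDMA);
	if (fd < 0)
		return 1;

	memset(&addr, 0, sizeof(addr));
	addr.nl_family = AF_NETLINK;
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	/* This membership is what ibnl_chk_listeners(RDMA_NL_GROUP_LS) sees */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
		       &group, sizeof(group)) < 0)
		return 1;

	for (;;) {
		ssize_t len = recv(fd, buf, sizeof(buf), 0);

		if (len <= 0)
			break;
		printf("received %zd byte resolve request\n", len);
		/* a real responder parses the attrs and replies here */
	}
	close(fd);
	return 0;
}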

Comments

Hal Rosenstock June 10, 2015, 5:47 p.m. UTC | #1
On 6/9/2015 10:57 AM, kaike.wan@intel.com wrote:
> From: Kaike Wan <kaike.wan@intel.com>
> 
> This patch routes a SA pathrecord query to netlink first

Should only unicast PRs be done in this manner, or should the API support
enabling it for unicast and/or multicast?

AFAIK the kernel doesn't query multicast PRs now (it queries MCMRs), but this
seems like it would help make it future-proof and avoid taking the timeout on
a local query unless the app supports it.

> and processes the
> response appropriately. If a failure is returned, the request will be sent
> through IB. The decision whether to route the request to netlink first is
> determined by the presence of a listener for the local service netlink
> multicast group. If the user-space local service netlink multicast group
> listener is not present, the request will be sent through IB, just like
> what is currently being done.
> 
> Signed-off-by: Kaike Wan <kaike.wan@intel.com>
> Signed-off-by: John Fleck <john.fleck@intel.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> ---
>  drivers/infiniband/core/sa_query.c |  515 +++++++++++++++++++++++++++++++++++-
>  include/uapi/rdma/rdma_netlink.h   |    2 +-
>  2 files changed, 515 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
> index 17e1cf7..e063593 100644
> --- a/drivers/infiniband/core/sa_query.c
> +++ b/drivers/infiniband/core/sa_query.c
> @@ -45,12 +45,21 @@
>  #include <uapi/linux/if_ether.h>
>  #include <rdma/ib_pack.h>
>  #include <rdma/ib_cache.h>
> +#include <rdma/rdma_netlink.h>
> +#include <net/netlink.h>
> +#include <uapi/rdma/ib_user_sa.h>
> +#include <rdma/ib_marshall.h>
>  #include "sa.h"
>  
>  MODULE_AUTHOR("Roland Dreier");
>  MODULE_DESCRIPTION("InfiniBand subnet administration query support");
>  MODULE_LICENSE("Dual BSD/GPL");
>  
> +#define IB_SA_LOCAL_SVC_TIMEOUT_MIN		100
> +#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT		2000
> +#define IB_SA_LOCAL_SVC_TIMEOUT_MAX		200000
> +static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
> +
>  struct ib_sa_sm_ah {
>  	struct ib_ah        *ah;
>  	struct kref          ref;
> @@ -80,8 +89,24 @@ struct ib_sa_query {
>  	struct ib_mad_send_buf *mad_buf;
>  	struct ib_sa_sm_ah     *sm_ah;
>  	int			id;
> +	u32			flags;
>  };
>  
> +#define IB_SA_ENABLE_LOCAL_SERVICE	0x00000001
> +#define IB_SA_CANCEL			0x00000002
> +
> +#define IB_SA_LOCAL_SVC_ENABLED(query) \
> +	((query)->flags & IB_SA_ENABLE_LOCAL_SERVICE)
> +#define IB_SA_ENABLE_LOCAL_SVC(query) \
> +	((query)->flags |= IB_SA_ENABLE_LOCAL_SERVICE)
> +#define IB_SA_DISABLE_LOCAL_SVC(query) \
> +	((query)->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE)
> +
> +#define IB_SA_QUERY_CANCELLED(query) \
> +	((query)->flags & IB_SA_CANCEL)
> +#define IB_SA_CANCEL_QUERY(query) \
> +	((query)->flags |= IB_SA_CANCEL)
> +
>  struct ib_sa_service_query {
>  	void (*callback)(int, struct ib_sa_service_rec *, void *);
>  	void *context;
> @@ -106,6 +131,26 @@ struct ib_sa_mcmember_query {
>  	struct ib_sa_query sa_query;
>  };
>  
> +struct ib_nl_request_info {
> +	struct list_head list;
> +	u32 seq;
> +	unsigned long timeout;
> +	struct ib_sa_query *query;
> +};
> +
> +struct ib_nl_attr_info {
> +	u16 len;	/* Total attr len: header + payload + padding */
> +	ib_sa_comp_mask comp_mask;
> +	void *input;
> +	void (*set_attrs)(struct sk_buff *skb, struct ib_nl_attr_info *info);
> +};
> +
> +static LIST_HEAD(ib_nl_request_list);
> +static DEFINE_SPINLOCK(ib_nl_request_lock);
> +static atomic_t ib_nl_sa_request_seq;
> +static struct workqueue_struct *ib_nl_wq;
> +static struct delayed_work ib_nl_timed_work;
> +
>  static void ib_sa_add_one(struct ib_device *device);
>  static void ib_sa_remove_one(struct ib_device *device);
>  
> @@ -381,6 +426,433 @@ static const struct ib_field guidinfo_rec_table[] = {
>  	  .size_bits    = 512 },
>  };
>  
> +static int ib_nl_send_msg(int opcode, struct ib_nl_attr_info *attrs, u32 seq)
> +{
> +	struct sk_buff *skb = NULL;
> +	struct nlmsghdr *nlh;
> +	void *data;
> +	int ret = 0;
> +
> +	if (attrs->len <= 0)
> +		return -EMSGSIZE;
> +
> +	skb = nlmsg_new(attrs->len, GFP_KERNEL);
> +	if (!skb) {
> +		pr_err("alloc failed ret=%d\n", ret);
> +		return -ENOMEM;
> +	}
> +
> +	/* Put nlmsg header only for now */
> +	data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_SA,
> +			    opcode, GFP_KERNEL);
> +	if (!data) {
> +		kfree_skb(skb);
> +		return -EMSGSIZE;
> +	}
> +
> +	/* Add attributes */
> +	attrs->set_attrs(skb, attrs);
> +
> +	/* Repair the nlmsg header length */
> +	nlmsg_end(skb, nlh);
> +
> +	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
> +	if (!ret) {
> +		ret = attrs->len;
> +	} else {
> +		if (ret != -ESRCH)
> +			pr_err("ibnl_multicast failed l=%d, r=%d\n",
> +			       attrs->len, ret);
> +		ret = 0;
> +	}
> +	return ret;
> +}
> +
> +static struct ib_nl_request_info *
> +ib_nl_alloc_request(struct ib_sa_query *query)
> +{
> +	struct ib_nl_request_info *rinfo;
> +
> +	rinfo = kzalloc(sizeof(*rinfo), GFP_ATOMIC);
> +	if (rinfo == NULL)
> +		return ERR_PTR(-ENOMEM);
> +
> +	INIT_LIST_HEAD(&rinfo->list);
> +	rinfo->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
> +	rinfo->query = query;
> +
> +	return rinfo;
> +}
> +
> +static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
> +				     struct ib_nl_attr_info *info)
> +{
> +	struct ib_sa_path_rec *sa_rec = info->input;
> +
> +	if (info->comp_mask & IB_SA_PATH_REC_SERVICE_ID)
> +		nla_put(skb, LS_NLA_TYPE_SERVICE_ID,
> +			sizeof(sa_rec->service_id), &sa_rec->service_id);
> +	if (info->comp_mask & IB_SA_PATH_REC_DGID)
> +		nla_put(skb, LS_NLA_TYPE_DGID, sizeof(sa_rec->dgid),
> +			&sa_rec->dgid);
> +	if (info->comp_mask & IB_SA_PATH_REC_SGID)
> +		nla_put(skb, LS_NLA_TYPE_SGID, sizeof(sa_rec->sgid),
> +			&sa_rec->sgid);
> +	if (info->comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
> +		nla_put(skb, LS_NLA_TYPE_TCLASS, sizeof(sa_rec->traffic_class),
> +			&sa_rec->traffic_class);
> +	if (info->comp_mask & IB_SA_PATH_REC_REVERSIBLE)
> +		nla_put(skb, LS_NLA_TYPE_REVERSIBLE,
> +			sizeof(sa_rec->reversible), &sa_rec->reversible);
> +	if (info->comp_mask & IB_SA_PATH_REC_NUMB_PATH)
> +		nla_put(skb, LS_NLA_TYPE_NUMB_PATH,
> +			sizeof(sa_rec->numb_path), &sa_rec->numb_path);
> +	if (info->comp_mask & IB_SA_PATH_REC_PKEY)
> +		nla_put(skb, LS_NLA_TYPE_PKEY, sizeof(sa_rec->pkey),
> +			&sa_rec->pkey);
> +	if (info->comp_mask & IB_SA_PATH_REC_QOS_CLASS)
> +		nla_put(skb, LS_NLA_TYPE_QOS_CLASS,
> +			sizeof(sa_rec->qos_class), &sa_rec->qos_class);
> +}
> +
> +static int ib_nl_get_path_rec_attrs_len(struct ib_nl_attr_info *info)
> +{
> +	int len = 0;
> +
> +	if (info->comp_mask & IB_SA_PATH_REC_SERVICE_ID)
> +		len += nla_total_size(sizeof(struct rdma_nla_ls_service_id));
> +	if (info->comp_mask & IB_SA_PATH_REC_DGID)
> +		len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
> +	if (info->comp_mask & IB_SA_PATH_REC_SGID)
> +		len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
> +	if (info->comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
> +		len += nla_total_size(sizeof(struct rdma_nla_ls_tclass));
> +	if (info->comp_mask & IB_SA_PATH_REC_REVERSIBLE)
> +		len += nla_total_size(sizeof(struct rdma_nla_ls_reversible));
> +	if (info->comp_mask & IB_SA_PATH_REC_NUMB_PATH)
> +		len += nla_total_size(sizeof(struct rdma_nla_ls_numb_path));
> +	if (info->comp_mask & IB_SA_PATH_REC_PKEY)
> +		len += nla_total_size(sizeof(struct rdma_nla_ls_pkey));
> +	if (info->comp_mask & IB_SA_PATH_REC_QOS_CLASS)
> +		len += nla_total_size(sizeof(struct rdma_nla_ls_qos_class));
> +
> +	return len;
> +}
> +
> +static int ib_nl_send_request(struct ib_nl_request_info *rinfo)
> +{
> +	struct ib_nl_attr_info info;
> +	int opcode;
> +	struct ib_sa_mad *mad;
> +	unsigned long flags;
> +	unsigned long delay;
> +	int ret;
> +
> +	mad = rinfo->query->mad_buf->mad;
> +	switch (mad->mad_hdr.attr_id) {
> +	case cpu_to_be16(IB_SA_ATTR_PATH_REC):
> +		opcode = RDMA_NL_LS_OP_RESOLVE;
> +		mad = rinfo->query->mad_buf->mad;
> +		info.comp_mask = mad->sa_hdr.comp_mask;
> +		info.input = rinfo->query->mad_buf->context[1];
> +		rinfo->query->mad_buf->context[1] = NULL;
> +		info.len = ib_nl_get_path_rec_attrs_len(&info);
> +		info.set_attrs = ib_nl_set_path_rec_attrs;
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> +	ret = ib_nl_send_msg(opcode, &info, rinfo->seq);
> +	if (ret <= 0) {
> +		ret = -EIO;
> +		goto request_out;
> +	} else {
> +		ret = 0;
> +	}
> +
> +	delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
> +	rinfo->timeout = delay + jiffies;
> +	list_add_tail(&rinfo->list, &ib_nl_request_list);
> +	/* Start the timeout if this is the only request */
> +	if (ib_nl_request_list.next == &rinfo->list)
> +		queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
> +
> +request_out:
> +	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +
> +	return ret;
> +}
> +
> +static int ib_nl_make_request(struct ib_sa_query *query)
> +{
> +	struct ib_nl_request_info *rinfo;
> +	int ret;
> +
> +	rinfo = ib_nl_alloc_request(query);
> +	if (IS_ERR(rinfo))
> +		return -ENOMEM;
> +
> +	ret = ib_nl_send_request(rinfo);
> +	if (ret)
> +		kfree(rinfo);
> +
> +	return ret;
> +}
> +
> +static int ib_nl_cancel_request(struct ib_sa_query *query)
> +{
> +	unsigned long flags;
> +	struct ib_nl_request_info *rinfo;
> +	int found = 0;
> +
> +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> +	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
> +		/* Let the timeout routine take care of the callback */
> +		if (query == rinfo->query) {
> +			IB_SA_CANCEL_QUERY(query);
> +			rinfo->timeout = jiffies;
> +			list_move(&rinfo->list, &ib_nl_request_list);
> +			found = 1;
> +			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
> +			break;
> +		}
> +	}
> +	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +
> +	return found;
> +}
> +
> +static void send_handler(struct ib_mad_agent *agent,
> +			 struct ib_mad_send_wc *mad_send_wc);
> +
> +static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
> +					   const struct nlmsghdr *nlh)
> +{
> +	struct ib_mad_send_wc mad_send_wc;
> +	struct ib_sa_mad *mad = NULL;
> +	const struct nlattr *head, *curr;
> +	struct ib_path_rec_data  *rec;
> +	int len, rem;
> +
> +	if (query->callback) {
> +		head = (const struct nlattr *) nlmsg_data(nlh);
> +		len = nlmsg_len(nlh);
> +		nla_for_each_attr(curr, head, len, rem) {
> +			if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
> +				rec = nla_data(curr);
> +				if ((rec->flags & IB_PATH_GMP) &&
> +				    (rec->flags & IB_PATH_PRIMARY)) {
> +					mad = query->mad_buf->mad;
> +					mad->mad_hdr.method |=
> +						IB_MGMT_METHOD_RESP;
> +					memcpy(mad->data, rec->path_rec,
> +					       sizeof(rec->path_rec));
> +					query->callback(query, 0, mad);
> +					break;
> +				}
> +			}
> +		}
> +	}
> +
> +	mad_send_wc.send_buf = query->mad_buf;
> +	mad_send_wc.status = IB_WC_SUCCESS;
> +	send_handler(query->mad_buf->mad_agent, &mad_send_wc);
> +}
> +
> +static void ib_nl_request_timeout(struct work_struct *work)
> +{
> +	unsigned long flags;
> +	struct ib_nl_request_info *rinfo;
> +	struct ib_sa_query *query;
> +	unsigned long delay;
> +	struct ib_mad_send_wc mad_send_wc;
> +	int ret;
> +
> +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> +	while (!list_empty(&ib_nl_request_list)) {
> +		rinfo = list_entry(ib_nl_request_list.next,
> +				   struct ib_nl_request_info, list);
> +
> +		if (time_after(rinfo->timeout, jiffies)) {
> +			delay = rinfo->timeout - jiffies;
> +			if ((long)delay <= 0)
> +				delay = 1;
> +			queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
> +			break;
> +		}
> +
> +		list_del(&rinfo->list);
> +		query = rinfo->query;
> +		IB_SA_DISABLE_LOCAL_SVC(query);
> +		/* Hold the lock to protect against query cancellation */
> +		if (IB_SA_QUERY_CANCELLED(query))
> +			ret = -1;
> +		else
> +			ret = ib_post_send_mad(query->mad_buf, NULL);
> +		if (ret) {
> +			mad_send_wc.send_buf = query->mad_buf;
> +			mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
> +			spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +			send_handler(query->port->agent, &mad_send_wc);
> +			spin_lock_irqsave(&ib_nl_request_lock, flags);
> +		}
> +		kfree(rinfo);
> +	}
> +	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +}
> +
> +static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
> +{
> +	const struct nlattr *head, *curr;
> +	struct ib_path_rec_data  *rec;
> +	int len, rem;
> +
> +	if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
> +		return 0;
> +
> +	if (!(nlh->nlmsg_flags & RDMA_NL_LS_F_OK))
> +		return 0;
> +
> +	if (nlmsg_len(nlh) < nla_attr_size(sizeof(*rec)))
> +		return 0;
> +
> +	head = (const struct nlattr *) nlmsg_data(nlh);
> +	len = nlmsg_len(nlh);
> +	nla_for_each_attr(curr, head, len, rem) {
> +		if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
> +			rec = nla_data(curr);
> +			if ((rec->flags & IB_PATH_GMP) &&
> +			    (rec->flags & IB_PATH_PRIMARY))
> +				return 1;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int ib_nl_handle_set_timeout(struct sk_buff *skb,
> +				    struct netlink_callback *cb)
> +{
> +	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
> +	int timeout, delta, abs_delta;
> +	const struct nlattr *attr;
> +	struct rdma_nla_ls_timeout *to_attr;
> +	unsigned long flags;
> +	struct ib_nl_request_info *rinfo;
> +	long delay = 0;
> +
> +	if (nlmsg_len(nlh) < nla_attr_size(sizeof(*to_attr)))
> +		goto settimeout_out;
> +
> +	attr = (const struct nlattr *) nlmsg_data(nlh);
> +	if (attr->nla_type != LS_NLA_TYPE_TIMEOUT ||
> +	    nla_len(attr) != sizeof(*to_attr))
> +		goto settimeout_out;
> +
> +	to_attr = (struct rdma_nla_ls_timeout *) nla_data(attr);
> +	timeout = (int) to_attr->timeout;
> +	if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN)
> +		timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
> +	if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
> +		timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
> +
> +	delta = timeout - sa_local_svc_timeout_ms;
> +	if (delta < 0)
> +		abs_delta = -delta;
> +	else
> +		abs_delta = delta;
> +
> +	if (delta != 0) {
> +		spin_lock_irqsave(&ib_nl_request_lock, flags);
> +		sa_local_svc_timeout_ms = timeout;
> +		list_for_each_entry(rinfo, &ib_nl_request_list, list) {
> +			if (delta < 0 && abs_delta > rinfo->timeout)
> +				rinfo->timeout = 0;
> +			else
> +				rinfo->timeout += delta;
> +
> +			/* Get the new delay from the first entry */
> +			if (!delay) {
> +				delay = rinfo->timeout - jiffies;
> +				if (delay <= 0)
> +					delay = 1;
> +			}
> +		}
> +		if (delay)
> +			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
> +					 (unsigned long)delay);
> +		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +	}
> +
> +settimeout_out:
> +	return skb->len;
> +}
> +
> +static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
> +				     struct netlink_callback *cb)
> +{
> +	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
> +	unsigned long flags;
> +	struct ib_nl_request_info *rinfo;
> +	struct ib_sa_query *query;
> +	struct ib_mad_send_buf *send_buf;
> +	struct ib_mad_send_wc mad_send_wc;
> +	int found = 0;
> +	int ret;
> +
> +	spin_lock_irqsave(&ib_nl_request_lock, flags);
> +	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
> +		/*
> +		 * If the query is cancelled, let the timeout routine
> +		 * take care of it.
> +		 */
> +		if (nlh->nlmsg_seq == rinfo->seq) {
> +			found = !IB_SA_QUERY_CANCELLED(rinfo->query);
> +			if (found)
> +				list_del(&rinfo->list);
> +			break;
> +		}
> +	}
> +
> +	if (!found) {
> +		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +		goto resp_out;
> +	}
> +
> +	query = rinfo->query;
> +	send_buf = query->mad_buf;
> +
> +	if (!ib_nl_is_good_resolve_resp(nlh)) {
> +		/* if the result is a failure, send out the packet via IB */
> +		IB_SA_DISABLE_LOCAL_SVC(query);
> +		ret = ib_post_send_mad(query->mad_buf, NULL);
> +		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +		if (ret) {
> +			mad_send_wc.send_buf = send_buf;
> +			mad_send_wc.status = IB_WC_GENERAL_ERR;
> +			send_handler(query->port->agent, &mad_send_wc);
> +		}
> +	} else {
> +		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
> +		ib_nl_process_good_resolve_rsp(query, nlh);
> +	}
> +
> +	kfree(rinfo);
> +resp_out:
> +	return skb->len;
> +}
> +
> +static struct ibnl_client_cbs ib_sa_cb_table[] = {
> +	[RDMA_NL_LS_OP_RESOLVE] = {
> +		.dump = ib_nl_handle_resolve_resp,
> +		.module = THIS_MODULE },
> +	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
> +		.dump = ib_nl_handle_set_timeout,
> +		.module = THIS_MODULE },
> +};
> +
>  static void free_sm_ah(struct kref *kref)
>  {
>  	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
> @@ -502,7 +974,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
>  	mad_buf = query->mad_buf;
>  	spin_unlock_irqrestore(&idr_lock, flags);
>  
> -	ib_cancel_mad(agent, mad_buf);
> +	/*
> +	 * If the query is still on the netlink request list, schedule
> +	 * it to be cancelled by the timeout routine. Otherwise, it has been
> +	 * sent to the MAD layer and has to be cancelled from there.
> +	 */
> +	if (!ib_nl_cancel_request(query))
> +		ib_cancel_mad(agent, mad_buf);
>  }
>  EXPORT_SYMBOL(ib_sa_cancel_query);
>  
> @@ -638,6 +1116,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
>  	query->mad_buf->context[0] = query;
>  	query->id = id;
>  
> +	if (IB_SA_LOCAL_SVC_ENABLED(query)) {
> +		if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) {
> +			if (!ib_nl_make_request(query))
> +				return id;
> +		}
> +		IB_SA_DISABLE_LOCAL_SVC(query);
> +	}
> +
>  	ret = ib_post_send_mad(query->mad_buf, NULL);
>  	if (ret) {
>  		spin_lock_irqsave(&idr_lock, flags);
> @@ -766,6 +1252,9 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
>  
>  	*sa_query = &query->sa_query;
>  
> +	IB_SA_ENABLE_LOCAL_SVC(&query->sa_query);
> +	query->sa_query.mad_buf->context[1] = rec;
> +
>  	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
>  	if (ret < 0)
>  		goto err2;
> @@ -1250,6 +1739,8 @@ static int __init ib_sa_init(void)
>  
>  	get_random_bytes(&tid, sizeof tid);
>  
> +	atomic_set(&ib_nl_sa_request_seq, 0);
> +
>  	ret = ib_register_client(&sa_client);
>  	if (ret) {
>  		printk(KERN_ERR "Couldn't register ib_sa client\n");
> @@ -1262,7 +1753,25 @@ static int __init ib_sa_init(void)
>  		goto err2;
>  	}
>  
> +	ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq");
> +	if (!ib_nl_wq) {
> +		ret = -ENOMEM;
> +		goto err3;
> +	}
> +
> +	if (ibnl_add_client(RDMA_NL_SA, RDMA_NL_LS_NUM_OPS,
> +			    ib_sa_cb_table)) {
> +		pr_err("Failed to add netlink callback\n");
> +		ret = -EINVAL;
> +		goto err4;
> +	}
> +	INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
> +
>  	return 0;
> +err4:
> +	destroy_workqueue(ib_nl_wq);
> +err3:
> +	mcast_cleanup();
>  err2:
>  	ib_unregister_client(&sa_client);
>  err1:
> @@ -1271,6 +1780,10 @@ err1:
>  
>  static void __exit ib_sa_cleanup(void)
>  {
> +	ibnl_remove_client(RDMA_NL_SA);
> +	cancel_delayed_work(&ib_nl_timed_work);
> +	flush_workqueue(ib_nl_wq);
> +	destroy_workqueue(ib_nl_wq);
>  	mcast_cleanup();
>  	ib_unregister_client(&sa_client);
>  	idr_destroy(&query_idr);
> diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
> index 341e9be..6aa9281 100644
> --- a/include/uapi/rdma/rdma_netlink.h
> +++ b/include/uapi/rdma/rdma_netlink.h
> @@ -151,7 +151,7 @@ enum {
>  	LS_NLA_TYPE_SGID,
>  	LS_NLA_TYPE_TCLASS,
>  	LS_NLA_TYPE_REVERSIBLE,
> -	LS_NLA_TYPE_NUM_PATH,
> +	LS_NLA_TYPE_NUMB_PATH,

Looks to me like this should've been in patch 1.

>  	LS_NLA_TYPE_PKEY,
>  	LS_NLA_TYPE_QOS_CLASS,
>  	LS_NLA_TYPE_MAX

Jason Gunthorpe June 10, 2015, 7:10 p.m. UTC | #2
On Wed, Jun 10, 2015 at 01:47:36PM -0400, Hal Rosenstock wrote:
> On 6/9/2015 10:57 AM, kaike.wan@intel.com wrote:
> > From: Kaike Wan <kaike.wan@intel.com>
> > 
> > This patch routes a SA pathrecord query to netlink first
> 
> Should only unicast PRs be done in this manner or should API support
> enabling for unicast and/or multicast ?
> 
> AFAIK kernel doesn't query multicast PRs now (queries MCMRs) but this
> seems like it would help make it future proof and not have to take
> timeout on local query unless app supports it.

It is a good question. We can clearly extend toward that, using a MGID
as the DGID and adding additional nested netlink fields.
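For illustration only, such an extension might look something like the
sketch below on the kernel side (in the style of ib_nl_set_path_rec_attrs()
from this patch). LS_NLA_TYPE_MCAST_INFO and LS_NLA_TYPE_JOIN_STATE are
invented placeholders; nla_put(), nla_put_u8(), nla_nest_start() and
nla_nest_end() are the standard nlattr helpers:

/*
 * Hypothetical sketch: carry the MGID in the existing DGID attribute and
 * group any multicast-only fields under one invented nested attribute.
 */
static void ib_nl_set_mcast_attrs(struct sk_buff *skb, union ib_gid *mgid,
				  u8 join_state)
{
	struct nlattr *nest;

	/* MGID travels in the same 16-byte GID attribute as a unicast DGID */
	nla_put(skb, LS_NLA_TYPE_DGID, sizeof(*mgid), mgid);

	/* Placeholder nest for whatever multicast-specific data is needed */
	nest = nla_nest_start(skb, LS_NLA_TYPE_MCAST_INFO);
	if (nest) {
		nla_put_u8(skb, LS_NLA_TYPE_JOIN_STATE, join_state);
		nla_nest_end(skb, nest);
	}
}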

However, does it make sense? I don't think user space should be
joining on behalf of the kernel... Do we ever query without a join?

Jason
Hal Rosenstock June 10, 2015, 7:51 p.m. UTC | #3
On 6/10/2015 3:10 PM, Jason Gunthorpe wrote:
> On Wed, Jun 10, 2015 at 01:47:36PM -0400, Hal Rosenstock wrote:
>> On 6/9/2015 10:57 AM, kaike.wan@intel.com wrote:
>>> From: Kaike Wan <kaike.wan@intel.com>
>>>
>>> This patch routes a SA pathrecord query to netlink first
>>
>> Should only unicast PRs be done in this manner or should API support
>> enabling for unicast and/or multicast ?
>>
>> AFAIK kernel doesn't query multicast PRs now (queries MCMRs) but this
>> seems like it would help make it future proof and not have to take
>> timeout on local query unless app supports it.
> 
> It is a good question. We can clearly extend toward that, using a MGID
> as the DGID and adding additional nested netlink fields.
> 
> However, does it make sense? 

If it doesn't make sense, then should MGIDs as DGIDs never be requested
via the PR netlink API?

> I don't think user space should be
> joining on behalf of the kernel... 

Agreed.

> Do we ever query without a join?

Query is possible prior to join.

The use case I'm aware of is doing a PR query for an MGID rather than an
MCMR query. That case could be viewed just like unicast PRs.
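To make that concrete: a hypothetical kernel caller could already issue such
a query through the existing ib_sa_path_rec_get() interface by placing the
MGID in the DGID field, and with this patch the request would be multicast
to user space first just like a unicast PR query. In the sketch below,
my_sa_client, the port/GID parameters and the 3000 ms timeout are
assumptions for illustration only:

#include <rdma/ib_sa.h>

/* assumed to be registered elsewhere with ib_sa_register_client() */
static struct ib_sa_client my_sa_client;

static void mgid_path_handler(int status, struct ib_sa_path_rec *resp,
			      void *context)
{
	if (!status)
		pr_info("path record for MGID: dlid 0x%x\n",
			be16_to_cpu(resp->dlid));
}

static int query_path_to_mgid(struct ib_device *device, u8 port_num,
			      union ib_gid *sgid, union ib_gid *mgid,
			      u16 pkey, struct ib_sa_query **query)
{
	struct ib_sa_path_rec rec = {};

	rec.sgid = *sgid;
	rec.dgid = *mgid;		/* MGID carried in the DGID field */
	rec.numb_path = 1;
	rec.pkey = cpu_to_be16(pkey);

	return ib_sa_path_rec_get(&my_sa_client, device, port_num, &rec,
				  IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
				  IB_SA_PATH_REC_NUMB_PATH |
				  IB_SA_PATH_REC_PKEY,
				  3000, GFP_KERNEL,
				  mgid_path_handler, NULL, query);
}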

-- Hal

> Jason
> 

Ira Weiny June 10, 2015, 9:09 p.m. UTC | #4
> On 6/10/2015 3:10 PM, Jason Gunthorpe wrote:
> > On Wed, Jun 10, 2015 at 01:47:36PM -0400, Hal Rosenstock wrote:
> >> On 6/9/2015 10:57 AM, kaike.wan@intel.com wrote:
> >>> From: Kaike Wan <kaike.wan@intel.com>
> >>>
> >>> This patch routes a SA pathrecord query to netlink first
> >>
> >> Should only unicast PRs be done in this manner or should API support
> >> enabling for unicast and/or multicast ?
> >>
> >> AFAIK kernel doesn't query multicast PRs now (queries MCMRs) but this
> >> seems like it would help make it future proof and not have to take
> >> timeout on local query unless app supports it.
> >
> > It is a good question. We can clearly extend toward that, using a MGID
> > as the DGID and adding additional nested netlink fields.
> >
> > However, does it make sense?
> 
> If it doesn't make sense, then should MGIDs as DGIDs never be requested via
> the PR netlink API ?
> 

Why should we prevent it?  What does it hurt?

Ira
Hal Rosenstock June 11, 2015, 12:24 p.m. UTC | #5
On 6/10/2015 5:09 PM, Weiny, Ira wrote:
>> On 6/10/2015 3:10 PM, Jason Gunthorpe wrote:
>>> On Wed, Jun 10, 2015 at 01:47:36PM -0400, Hal Rosenstock wrote:
>>>> On 6/9/2015 10:57 AM, kaike.wan@intel.com wrote:
>>>>> From: Kaike Wan <kaike.wan@intel.com>
>>>>>
>>>>> This patch routes a SA pathrecord query to netlink first
>>>>
>>>> Should only unicast PRs be done in this manner or should API support
>>>> enabling for unicast and/or multicast ?
>>>>
>>>> AFAIK kernel doesn't query multicast PRs now (queries MCMRs) but this
>>>> seems like it would help make it future proof and not have to take
>>>> timeout on local query unless app supports it.
>>>
>>> It is a good question. We can clearly extend toward that, using a MGID
>>> as the DGID and adding additional nested netlink fields.
>>>
>>> However, does it make sense?
>>
>> If it doesn't make sense, then should MGIDs as DGIDs never be requested via
>> the PR netlink API ?
>>
> 
> Why should we prevent it?  What does it hurt?

It's merely an optimization: it avoids a round trip to user space in which
user space would need to perform some validation/lookup that would fail
when multicast PRs are not supported (which is the case for ACM with the
multicast backend).

If a user-space PR capability indication (unicast, multicast, or both) is
supported, it affects the API. Maybe that's overkill, but otherwise user
space has to indicate that no PR is available for multicast DGIDs. I think
this is the tradeoff to be made/decided.

-- Hal

> Ira
> 

Wan, Kaike June 11, 2015, 12:54 p.m. UTC | #6
> From: Hal Rosenstock [mailto:hal@dev.mellanox.co.il]
> Sent: Thursday, June 11, 2015 8:24 AM
> To: Weiny, Ira
> Cc: Jason Gunthorpe; Wan, Kaike; linux-rdma@vger.kernel.org; Fleck, John
> Subject: Re: [PATCH v4 4/4] IB/sa: Route SA pathrecord query through netlink
> 
> On 6/10/2015 5:09 PM, Weiny, Ira wrote:
> >> On 6/10/2015 3:10 PM, Jason Gunthorpe wrote:
> >>> On Wed, Jun 10, 2015 at 01:47:36PM -0400, Hal Rosenstock wrote:
> >>>> On 6/9/2015 10:57 AM, kaike.wan@intel.com wrote:
> >>>>> From: Kaike Wan <kaike.wan@intel.com>
> >>>>>
> >>>>> This patch routes a SA pathrecord query to netlink first
> >>>>
> >>>> Should only unicast PRs be done in this manner or should API
> >>>> support enabling for unicast and/or multicast ?
> >>>>
> >>>> AFAIK kernel doesn't query multicast PRs now (queries MCMRs) but
> >>>> this seems like it would help make it future proof and not have to
> >>>> take timeout on local query unless app supports it.
> >>>
> >>> It is a good question. We can clearly extend toward that, using a
> >>> MGID as the DGID and adding additional nested netlink fields.
> >>>
> >>> However, does it make sense?
> >>
> >> If it doesn't make sense, then should MGIDs as DGIDs never be
> >> requested via the PR netlink API ?
> >>
> >
> > Why should we prevent it?  What does it hurt?
> 
> It's merely an optimization in that round trip to user space is avoided with
> user space needing to perform some validation/lookup which would fail in
> case multicast PRs not supported (which is case for ACM with multicast
> backend).

If the kernel client can query SA for multicast PRs, why can't ibacm (even backed by acmp)? Is there any code there that prevents an MGID from being used as the destination?

> 
> If user space PR capabilities (unicast, multicast, both) is supported, it affects
> the API. 
How? If we can query SA for multicast PRs without joining the multicast groups, what additional changes in the netlink API do we need to support both?

>Maybe it's overkill but requires user space to indicate no PR available
> for multicast DGIDs. I think this is the tradeoff to be made/decided.
> 

Hal Rosenstock June 11, 2015, 1:25 p.m. UTC | #7
On 6/11/2015 8:54 AM, Wan, Kaike wrote:
>> From: Hal Rosenstock [mailto:hal@dev.mellanox.co.il]
>> Sent: Thursday, June 11, 2015 8:24 AM
>> To: Weiny, Ira
>> Cc: Jason Gunthorpe; Wan, Kaike; linux-rdma@vger.kernel.org; Fleck, John
>> Subject: Re: [PATCH v4 4/4] IB/sa: Route SA pathrecord query through netlink
>>
>> On 6/10/2015 5:09 PM, Weiny, Ira wrote:
>>>> On 6/10/2015 3:10 PM, Jason Gunthorpe wrote:
>>>>> On Wed, Jun 10, 2015 at 01:47:36PM -0400, Hal Rosenstock wrote:
>>>>>> On 6/9/2015 10:57 AM, kaike.wan@intel.com wrote:
>>>>>>> From: Kaike Wan <kaike.wan@intel.com>
>>>>>>>
>>>>>>> This patch routes a SA pathrecord query to netlink first
>>>>>>
>>>>>> Should only unicast PRs be done in this manner or should API
>>>>>> support enabling for unicast and/or multicast ?
>>>>>>
>>>>>> AFAIK kernel doesn't query multicast PRs now (queries MCMRs) but
>>>>>> this seems like it would help make it future proof and not have to
>>>>>> take timeout on local query unless app supports it.
>>>>>
>>>>> It is a good question. We can clearly extend toward that, using a
>>>>> MGID as the DGID and adding additional nested netlink fields.
>>>>>
>>>>> However, does it make sense?
>>>>
>>>> If it doesn't make sense, then should MGIDs as DGIDs never be
>>>> requested via the PR netlink API ?
>>>>
>>>
>>> Why should we prevent it?  What does it hurt?
>>
>> It's merely an optimization in that round trip to user space is avoided with
>> user space needing to perform some validation/lookup which would fail in
>> case multicast PRs not supported (which is case for ACM with multicast
>> backend).
> 
> If the kernel client can query SA for multicast PRs, why can't ibacm (even backed by acmp)?

It can.

> Is there any code there that prevents an MGID from being used as the destination?

No. It's a matter of optimization only.

>>
>> If user space PR capabilities (unicast, multicast, both) is supported, it affects
>> the API. 
> How? If we can query SA for multicast PRs without joining the multicast groups, 
> what additional changes in netlink API do we need to support both?

Nothing to support both, but if we wanted to disable one or the other based
on user-space capability we would need it. It sounds like we don't need/want
that, though, and would use user-space rejection/no data for this instead.
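Concretely, with the kernel side in this patch, "rejection/no data" only
requires the responder to echo the request's sequence number and either set
RDMA_NL_LS_F_ERR or leave RDMA_NL_LS_F_OK unset; ib_nl_is_good_resolve_resp()
then fails and ib_nl_handle_resolve_resp() falls back to ib_post_send_mad().
A hypothetical user-space helper for building such a reply header might look
like the sketch below, using the uapi names from this series; how the message
is actually delivered back over the NETLINK_RDMA socket (and whatever
core-netlink flags that requires) is omitted:

#include <linux/netlink.h>
#include <rdma/rdma_netlink.h>
#include <string.h>

/* Hypothetical sketch: a "no path record available" reply for the resolve
 * request whose header is 'req'.  Only the fields this patch checks are
 * filled in; no path record attribute is appended.
 */
static void build_reject_reply(const struct nlmsghdr *req,
			       struct nlmsghdr *reply)
{
	memset(reply, 0, sizeof(*reply));
	reply->nlmsg_len   = NLMSG_HDRLEN;	/* header only, no attributes */
	reply->nlmsg_type  = RDMA_NL_GET_TYPE(RDMA_NL_SA,
					      RDMA_NL_LS_OP_RESOLVE);
	reply->nlmsg_flags = RDMA_NL_LS_F_ERR;	/* or simply omit ..._F_OK */
	reply->nlmsg_seq   = req->nlmsg_seq;	/* match the pending request */
}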

>> Maybe it's overkill but requires user space to indicate no PR available
>> for multicast DGIDs. I think this is the tradeoff to be made/decided.
>>
> 
> 

Ira Weiny June 11, 2015, 4:08 p.m. UTC | #8
> >> If user space PR capabilities (unicast, multicast, both) is
> >> supported, it affects the API.
> > How? If we can query SA for multicast PRs without joining the
> > multicast groups, what additional changes in netlink API do we need to
> support both?
> 
> Nothing to support both but if we wanted to disable one or other based on user
> space we would but it sounds like we don't need/want this but would use user
> space rejection/no data for this.
> 

Yes, I think we should allow userspace to decide whether this is supported or not. That allows for more flexibility.

Ira


Patch

diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 17e1cf7..e063593 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -45,12 +45,21 @@ 
 #include <uapi/linux/if_ether.h>
 #include <rdma/ib_pack.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+#include <uapi/rdma/ib_user_sa.h>
+#include <rdma/ib_marshall.h>
 #include "sa.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand subnet administration query support");
 MODULE_LICENSE("Dual BSD/GPL");
 
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN		100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT		2000
+#define IB_SA_LOCAL_SVC_TIMEOUT_MAX		200000
+static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
+
 struct ib_sa_sm_ah {
 	struct ib_ah        *ah;
 	struct kref          ref;
@@ -80,8 +89,24 @@  struct ib_sa_query {
 	struct ib_mad_send_buf *mad_buf;
 	struct ib_sa_sm_ah     *sm_ah;
 	int			id;
+	u32			flags;
 };
 
+#define IB_SA_ENABLE_LOCAL_SERVICE	0x00000001
+#define IB_SA_CANCEL			0x00000002
+
+#define IB_SA_LOCAL_SVC_ENABLED(query) \
+	((query)->flags & IB_SA_ENABLE_LOCAL_SERVICE)
+#define IB_SA_ENABLE_LOCAL_SVC(query) \
+	((query)->flags |= IB_SA_ENABLE_LOCAL_SERVICE)
+#define IB_SA_DISABLE_LOCAL_SVC(query) \
+	((query)->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE)
+
+#define IB_SA_QUERY_CANCELLED(query) \
+	((query)->flags & IB_SA_CANCEL)
+#define IB_SA_CANCEL_QUERY(query) \
+	((query)->flags |= IB_SA_CANCEL)
+
 struct ib_sa_service_query {
 	void (*callback)(int, struct ib_sa_service_rec *, void *);
 	void *context;
@@ -106,6 +131,26 @@  struct ib_sa_mcmember_query {
 	struct ib_sa_query sa_query;
 };
 
+struct ib_nl_request_info {
+	struct list_head list;
+	u32 seq;
+	unsigned long timeout;
+	struct ib_sa_query *query;
+};
+
+struct ib_nl_attr_info {
+	u16 len;	/* Total attr len: header + payload + padding */
+	ib_sa_comp_mask comp_mask;
+	void *input;
+	void (*set_attrs)(struct sk_buff *skb, struct ib_nl_attr_info *info);
+};
+
+static LIST_HEAD(ib_nl_request_list);
+static DEFINE_SPINLOCK(ib_nl_request_lock);
+static atomic_t ib_nl_sa_request_seq;
+static struct workqueue_struct *ib_nl_wq;
+static struct delayed_work ib_nl_timed_work;
+
 static void ib_sa_add_one(struct ib_device *device);
 static void ib_sa_remove_one(struct ib_device *device);
 
@@ -381,6 +426,433 @@  static const struct ib_field guidinfo_rec_table[] = {
 	  .size_bits    = 512 },
 };
 
+static int ib_nl_send_msg(int opcode, struct ib_nl_attr_info *attrs, u32 seq)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	void *data;
+	int ret = 0;
+
+	if (attrs->len <= 0)
+		return -EMSGSIZE;
+
+	skb = nlmsg_new(attrs->len, GFP_KERNEL);
+	if (!skb) {
+		pr_err("alloc failed ret=%d\n", ret);
+		return -ENOMEM;
+	}
+
+	/* Put nlmsg header only for now */
+	data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_SA,
+			    opcode, GFP_KERNEL);
+	if (!data) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	/* Add attributes */
+	attrs->set_attrs(skb, attrs);
+
+	/* Repair the nlmsg header length */
+	nlmsg_end(skb, nlh);
+
+	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
+	if (!ret) {
+		ret = attrs->len;
+	} else {
+		if (ret != -ESRCH)
+			pr_err("ibnl_multicast failed l=%d, r=%d\n",
+			       attrs->len, ret);
+		ret = 0;
+	}
+	return ret;
+}
+
+static struct ib_nl_request_info *
+ib_nl_alloc_request(struct ib_sa_query *query)
+{
+	struct ib_nl_request_info *rinfo;
+
+	rinfo = kzalloc(sizeof(*rinfo), GFP_ATOMIC);
+	if (rinfo == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&rinfo->list);
+	rinfo->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
+	rinfo->query = query;
+
+	return rinfo;
+}
+
+static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
+				     struct ib_nl_attr_info *info)
+{
+	struct ib_sa_path_rec *sa_rec = info->input;
+
+	if (info->comp_mask & IB_SA_PATH_REC_SERVICE_ID)
+		nla_put(skb, LS_NLA_TYPE_SERVICE_ID,
+			sizeof(sa_rec->service_id), &sa_rec->service_id);
+	if (info->comp_mask & IB_SA_PATH_REC_DGID)
+		nla_put(skb, LS_NLA_TYPE_DGID, sizeof(sa_rec->dgid),
+			&sa_rec->dgid);
+	if (info->comp_mask & IB_SA_PATH_REC_SGID)
+		nla_put(skb, LS_NLA_TYPE_SGID, sizeof(sa_rec->sgid),
+			&sa_rec->sgid);
+	if (info->comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+		nla_put(skb, LS_NLA_TYPE_TCLASS, sizeof(sa_rec->traffic_class),
+			&sa_rec->traffic_class);
+	if (info->comp_mask & IB_SA_PATH_REC_REVERSIBLE)
+		nla_put(skb, LS_NLA_TYPE_REVERSIBLE,
+			sizeof(sa_rec->reversible), &sa_rec->reversible);
+	if (info->comp_mask & IB_SA_PATH_REC_NUMB_PATH)
+		nla_put(skb, LS_NLA_TYPE_NUMB_PATH,
+			sizeof(sa_rec->numb_path), &sa_rec->numb_path);
+	if (info->comp_mask & IB_SA_PATH_REC_PKEY)
+		nla_put(skb, LS_NLA_TYPE_PKEY, sizeof(sa_rec->pkey),
+			&sa_rec->pkey);
+	if (info->comp_mask & IB_SA_PATH_REC_QOS_CLASS)
+		nla_put(skb, LS_NLA_TYPE_QOS_CLASS,
+			sizeof(sa_rec->qos_class), &sa_rec->qos_class);
+}
+
+static int ib_nl_get_path_rec_attrs_len(struct ib_nl_attr_info *info)
+{
+	int len = 0;
+
+	if (info->comp_mask & IB_SA_PATH_REC_SERVICE_ID)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_service_id));
+	if (info->comp_mask & IB_SA_PATH_REC_DGID)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+	if (info->comp_mask & IB_SA_PATH_REC_SGID)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+	if (info->comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_tclass));
+	if (info->comp_mask & IB_SA_PATH_REC_REVERSIBLE)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_reversible));
+	if (info->comp_mask & IB_SA_PATH_REC_NUMB_PATH)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_numb_path));
+	if (info->comp_mask & IB_SA_PATH_REC_PKEY)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_pkey));
+	if (info->comp_mask & IB_SA_PATH_REC_QOS_CLASS)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_qos_class));
+
+	return len;
+}
+
+static int ib_nl_send_request(struct ib_nl_request_info *rinfo)
+{
+	struct ib_nl_attr_info info;
+	int opcode;
+	struct ib_sa_mad *mad;
+	unsigned long flags;
+	unsigned long delay;
+	int ret;
+
+	mad = rinfo->query->mad_buf->mad;
+	switch (mad->mad_hdr.attr_id) {
+	case cpu_to_be16(IB_SA_ATTR_PATH_REC):
+		opcode = RDMA_NL_LS_OP_RESOLVE;
+		mad = rinfo->query->mad_buf->mad;
+		info.comp_mask = mad->sa_hdr.comp_mask;
+		info.input = rinfo->query->mad_buf->context[1];
+		rinfo->query->mad_buf->context[1] = NULL;
+		info.len = ib_nl_get_path_rec_attrs_len(&info);
+		info.set_attrs = ib_nl_set_path_rec_attrs;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	ret = ib_nl_send_msg(opcode, &info, rinfo->seq);
+	if (ret <= 0) {
+		ret = -EIO;
+		goto request_out;
+	} else {
+		ret = 0;
+	}
+
+	delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
+	rinfo->timeout = delay + jiffies;
+	list_add_tail(&rinfo->list, &ib_nl_request_list);
+	/* Start the timeout if this is the only request */
+	if (ib_nl_request_list.next == &rinfo->list)
+		queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+
+request_out:
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+	return ret;
+}
+
+static int ib_nl_make_request(struct ib_sa_query *query)
+{
+	struct ib_nl_request_info *rinfo;
+	int ret;
+
+	rinfo = ib_nl_alloc_request(query);
+	if (IS_ERR(rinfo))
+		return -ENOMEM;
+
+	ret = ib_nl_send_request(rinfo);
+	if (ret)
+		kfree(rinfo);
+
+	return ret;
+}
+
+static int ib_nl_cancel_request(struct ib_sa_query *query)
+{
+	unsigned long flags;
+	struct ib_nl_request_info *rinfo;
+	int found = 0;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
+		/* Let the timeout routine take care of the callback */
+		if (query == rinfo->query) {
+			IB_SA_CANCEL_QUERY(query);
+			rinfo->timeout = jiffies;
+			list_move(&rinfo->list, &ib_nl_request_list);
+			found = 1;
+			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+	return found;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc);
+
+static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
+					   const struct nlmsghdr *nlh)
+{
+	struct ib_mad_send_wc mad_send_wc;
+	struct ib_sa_mad *mad = NULL;
+	const struct nlattr *head, *curr;
+	struct ib_path_rec_data  *rec;
+	int len, rem;
+
+	if (query->callback) {
+		head = (const struct nlattr *) nlmsg_data(nlh);
+		len = nlmsg_len(nlh);
+		nla_for_each_attr(curr, head, len, rem) {
+			if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
+				rec = nla_data(curr);
+				if ((rec->flags & IB_PATH_GMP) &&
+				    (rec->flags & IB_PATH_PRIMARY)) {
+					mad = query->mad_buf->mad;
+					mad->mad_hdr.method |=
+						IB_MGMT_METHOD_RESP;
+					memcpy(mad->data, rec->path_rec,
+					       sizeof(rec->path_rec));
+					query->callback(query, 0, mad);
+					break;
+				}
+			}
+		}
+	}
+
+	mad_send_wc.send_buf = query->mad_buf;
+	mad_send_wc.status = IB_WC_SUCCESS;
+	send_handler(query->mad_buf->mad_agent, &mad_send_wc);
+}
+
+static void ib_nl_request_timeout(struct work_struct *work)
+{
+	unsigned long flags;
+	struct ib_nl_request_info *rinfo;
+	struct ib_sa_query *query;
+	unsigned long delay;
+	struct ib_mad_send_wc mad_send_wc;
+	int ret;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	while (!list_empty(&ib_nl_request_list)) {
+		rinfo = list_entry(ib_nl_request_list.next,
+				   struct ib_nl_request_info, list);
+
+		if (time_after(rinfo->timeout, jiffies)) {
+			delay = rinfo->timeout - jiffies;
+			if ((long)delay <= 0)
+				delay = 1;
+			queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+			break;
+		}
+
+		list_del(&rinfo->list);
+		query = rinfo->query;
+		IB_SA_DISABLE_LOCAL_SVC(query);
+		/* Hold the lock to protect against query cancellation */
+		if (IB_SA_QUERY_CANCELLED(query))
+			ret = -1;
+		else
+			ret = ib_post_send_mad(query->mad_buf, NULL);
+		if (ret) {
+			mad_send_wc.send_buf = query->mad_buf;
+			mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+			spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+			send_handler(query->port->agent, &mad_send_wc);
+			spin_lock_irqsave(&ib_nl_request_lock, flags);
+		}
+		kfree(rinfo);
+	}
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+}
+
+static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
+{
+	const struct nlattr *head, *curr;
+	struct ib_path_rec_data  *rec;
+	int len, rem;
+
+	if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+		return 0;
+
+	if (!(nlh->nlmsg_flags & RDMA_NL_LS_F_OK))
+		return 0;
+
+	if (nlmsg_len(nlh) < nla_attr_size(sizeof(*rec)))
+		return 0;
+
+	head = (const struct nlattr *) nlmsg_data(nlh);
+	len = nlmsg_len(nlh);
+	nla_for_each_attr(curr, head, len, rem) {
+		if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
+			rec = nla_data(curr);
+			if ((rec->flags & IB_PATH_GMP) &&
+			    (rec->flags & IB_PATH_PRIMARY))
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+static int ib_nl_handle_set_timeout(struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+	int timeout, delta, abs_delta;
+	const struct nlattr *attr;
+	struct rdma_nla_ls_timeout *to_attr;
+	unsigned long flags;
+	struct ib_nl_request_info *rinfo;
+	long delay = 0;
+
+	if (nlmsg_len(nlh) < nla_attr_size(sizeof(*to_attr)))
+		goto settimeout_out;
+
+	attr = (const struct nlattr *) nlmsg_data(nlh);
+	if (attr->nla_type != LS_NLA_TYPE_TIMEOUT ||
+	    nla_len(attr) != sizeof(*to_attr))
+		goto settimeout_out;
+
+	to_attr = (struct rdma_nla_ls_timeout *) nla_data(attr);
+	timeout = (int) to_attr->timeout;
+	if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN)
+		timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
+	if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
+		timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
+
+	delta = timeout - sa_local_svc_timeout_ms;
+	if (delta < 0)
+		abs_delta = -delta;
+	else
+		abs_delta = delta;
+
+	if (delta != 0) {
+		spin_lock_irqsave(&ib_nl_request_lock, flags);
+		sa_local_svc_timeout_ms = timeout;
+		list_for_each_entry(rinfo, &ib_nl_request_list, list) {
+			if (delta < 0 && abs_delta > rinfo->timeout)
+				rinfo->timeout = 0;
+			else
+				rinfo->timeout += delta;
+
+			/* Get the new delay from the first entry */
+			if (!delay) {
+				delay = rinfo->timeout - jiffies;
+				if (delay <= 0)
+					delay = 1;
+			}
+		}
+		if (delay)
+			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
+					 (unsigned long)delay);
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+	}
+
+settimeout_out:
+	return skb->len;
+}
+
+static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+	unsigned long flags;
+	struct ib_nl_request_info *rinfo;
+	struct ib_sa_query *query;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_send_wc mad_send_wc;
+	int found = 0;
+	int ret;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	list_for_each_entry(rinfo, &ib_nl_request_list, list) {
+		/*
+		 * If the query is cancelled, let the timeout routine
+		 * take care of it.
+		 */
+		if (nlh->nlmsg_seq == rinfo->seq) {
+			found = !IB_SA_QUERY_CANCELLED(rinfo->query);
+			if (found)
+				list_del(&rinfo->list);
+			break;
+		}
+	}
+
+	if (!found) {
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		goto resp_out;
+	}
+
+	query = rinfo->query;
+	send_buf = query->mad_buf;
+
+	if (!ib_nl_is_good_resolve_resp(nlh)) {
+		/* if the result is a failure, send out the packet via IB */
+		IB_SA_DISABLE_LOCAL_SVC(query);
+		ret = ib_post_send_mad(query->mad_buf, NULL);
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		if (ret) {
+			mad_send_wc.send_buf = send_buf;
+			mad_send_wc.status = IB_WC_GENERAL_ERR;
+			send_handler(query->port->agent, &mad_send_wc);
+		}
+	} else {
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		ib_nl_process_good_resolve_rsp(query, nlh);
+	}
+
+	kfree(rinfo);
+resp_out:
+	return skb->len;
+}
+
+static struct ibnl_client_cbs ib_sa_cb_table[] = {
+	[RDMA_NL_LS_OP_RESOLVE] = {
+		.dump = ib_nl_handle_resolve_resp,
+		.module = THIS_MODULE },
+	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
+		.dump = ib_nl_handle_set_timeout,
+		.module = THIS_MODULE },
+};
+
 static void free_sm_ah(struct kref *kref)
 {
 	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -502,7 +974,13 @@  void ib_sa_cancel_query(int id, struct ib_sa_query *query)
 	mad_buf = query->mad_buf;
 	spin_unlock_irqrestore(&idr_lock, flags);
 
-	ib_cancel_mad(agent, mad_buf);
+	/*
+	 * If the query is still on the netlink request list, schedule
+	 * it to be cancelled by the timeout routine. Otherwise, it has been
+	 * sent to the MAD layer and has to be cancelled from there.
+	 */
+	if (!ib_nl_cancel_request(query))
+		ib_cancel_mad(agent, mad_buf);
 }
 EXPORT_SYMBOL(ib_sa_cancel_query);
 
@@ -638,6 +1116,14 @@  static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
 	query->mad_buf->context[0] = query;
 	query->id = id;
 
+	if (IB_SA_LOCAL_SVC_ENABLED(query)) {
+		if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) {
+			if (!ib_nl_make_request(query))
+				return id;
+		}
+		IB_SA_DISABLE_LOCAL_SVC(query);
+	}
+
 	ret = ib_post_send_mad(query->mad_buf, NULL);
 	if (ret) {
 		spin_lock_irqsave(&idr_lock, flags);
@@ -766,6 +1252,9 @@  int ib_sa_path_rec_get(struct ib_sa_client *client,
 
 	*sa_query = &query->sa_query;
 
+	IB_SA_ENABLE_LOCAL_SVC(&query->sa_query);
+	query->sa_query.mad_buf->context[1] = rec;
+
 	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
 	if (ret < 0)
 		goto err2;
@@ -1250,6 +1739,8 @@  static int __init ib_sa_init(void)
 
 	get_random_bytes(&tid, sizeof tid);
 
+	atomic_set(&ib_nl_sa_request_seq, 0);
+
 	ret = ib_register_client(&sa_client);
 	if (ret) {
 		printk(KERN_ERR "Couldn't register ib_sa client\n");
@@ -1262,7 +1753,25 @@  static int __init ib_sa_init(void)
 		goto err2;
 	}
 
+	ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq");
+	if (!ib_nl_wq) {
+		ret = -ENOMEM;
+		goto err3;
+	}
+
+	if (ibnl_add_client(RDMA_NL_SA, RDMA_NL_LS_NUM_OPS,
+			    ib_sa_cb_table)) {
+		pr_err("Failed to add netlink callback\n");
+		ret = -EINVAL;
+		goto err4;
+	}
+	INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
+
 	return 0;
+err4:
+	destroy_workqueue(ib_nl_wq);
+err3:
+	mcast_cleanup();
 err2:
 	ib_unregister_client(&sa_client);
 err1:
@@ -1271,6 +1780,10 @@  err1:
 
 static void __exit ib_sa_cleanup(void)
 {
+	ibnl_remove_client(RDMA_NL_SA);
+	cancel_delayed_work(&ib_nl_timed_work);
+	flush_workqueue(ib_nl_wq);
+	destroy_workqueue(ib_nl_wq);
 	mcast_cleanup();
 	ib_unregister_client(&sa_client);
 	idr_destroy(&query_idr);
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 341e9be..6aa9281 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -151,7 +151,7 @@  enum {
 	LS_NLA_TYPE_SGID,
 	LS_NLA_TYPE_TCLASS,
 	LS_NLA_TYPE_REVERSIBLE,
-	LS_NLA_TYPE_NUM_PATH,
+	LS_NLA_TYPE_NUMB_PATH,
 	LS_NLA_TYPE_PKEY,
 	LS_NLA_TYPE_QOS_CLASS,
 	LS_NLA_TYPE_MAX