Message ID | 1433861878-26264-1-git-send-email-kaike.wan@intel.com (mailing list archive) |
---|---|
State | Superseded |
On 6/9/2015 10:57 AM, kaike.wan@intel.com wrote: > From: Kaike Wan <kaike.wan@intel.com> > > This patch enables ibacm to process pathrecord queries through netlink. > Since ibacm can cache pathrecords, this implementation provides an easy > pathrecord cache for kernel components and therefore offers great > performance advantage on large fabric systems. > > Signed-off-by: Kaike Wan <kaike.wan@intel.com> > Signed-off-by: John Fleck <john.fleck@intel.com> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > Reviewed-by: Sean Hefty <sean.hefty@intel.com> > --- > src/acm.c | 360 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- > 1 files changed, 357 insertions(+), 3 deletions(-) > > diff --git a/src/acm.c b/src/acm.c > index 7649725..d180bd2 100644 > --- a/src/acm.c > +++ b/src/acm.c > @@ -46,6 +46,8 @@ > #include <infiniband/acm_prov.h> > #include <infiniband/umad.h> > #include <infiniband/verbs.h> > +#include <infiniband/umad_types.h> > +#include <infiniband/umad_sa.h> > #include <dlist.h> > #include <dlfcn.h> > #include <search.h> > @@ -55,6 +57,8 @@ > #include <netinet/in.h> > #include <linux/netlink.h> > #include <linux/rtnetlink.h> > +#include <rdma/rdma_netlink.h> > +#include <rdma/ib_user_sa.h> > #include <poll.h> > #include "acm_mad.h" > #include "acm_util.h" > @@ -66,6 +70,7 @@ > #define MAX_EP_ADDR 4 > #define NL_MSG_BUF_SIZE 4096 > #define ACM_PROV_NAME_SIZE 64 > +#define NL_CLIENT_INDEX 0 > > struct acmc_subnet { > DLIST_ENTRY entry; > @@ -151,6 +156,26 @@ struct acmc_sa_req { > struct acm_sa_mad mad; > }; > > +struct acm_nl_status { > + struct nlattr attr_hdr; > + struct rdma_nla_ls_status status; > +}; > + > +struct acm_nl_path { > + struct nlattr attr_hdr; > + struct ib_path_rec_data rec; > +}; > + > +struct acm_nl_msg { > + struct nlmsghdr nlmsg_header; > + union { > + uint8_t data[ACM_MSG_DATA_LENGTH]; > + struct nlattr attr[0]; > + struct acm_nl_status status[0]; > + struct acm_nl_path path[0]; > + }; > +}; > + > static char def_prov_name[ACM_PROV_NAME_SIZE] = "ibacmp"; > static DLIST_ENTRY provider_list; > static struct acmc_prov *def_provider = NULL; > @@ -172,6 +197,7 @@ static struct acmc_ep *acm_find_ep(struct acmc_port *port, uint16_t pkey); > static int acm_ep_insert_addr(struct acmc_ep *ep, const char *name, uint8_t *addr, > size_t addr_len, uint8_t addr_type); > static void acm_event_handler(struct acmc_device *dev); > +static int acm_nl_send(SOCKET sock, struct acm_msg *msg); > > static struct sa_data { > int timeout; > @@ -466,7 +492,11 @@ int acm_resolve_response(uint64_t id, struct acm_msg *msg) > goto release; > } > > - ret = send(client->sock, (char *) msg, msg->hdr.length, 0); > + if (id == NL_CLIENT_INDEX) > + ret = acm_nl_send(client->sock, msg); > + else > + ret = send(client->sock, (char *) msg, msg->hdr.length, 0); > + > if (ret != msg->hdr.length) > acm_log(0, "ERROR - failed to send response\n"); > else > @@ -597,6 +627,8 @@ static void acm_svr_accept(void) > } > > for (i = 0; i < FD_SETSIZE - 1; i++) { > + if (i == NL_CLIENT_INDEX) > + continue; > if (!atomic_get(&client_array[i].refcnt)) > break; > } > @@ -1346,6 +1378,323 @@ static void acm_ipnl_handler(void) > } > } > > +static int acm_nl_send(SOCKET sock, struct acm_msg *msg) > +{ > + struct sockaddr_nl dst_addr; > + struct acm_nl_msg acmnlmsg; > + struct acm_nl_msg *orig; > + int ret; > + int datalen; > + > + orig = (struct acm_nl_msg *) msg->hdr.tid; > + > + memset(&dst_addr, 0, sizeof(dst_addr)); > + dst_addr.nl_family = AF_NETLINK; > + dst_addr.nl_groups = (1 << (RDMA_NL_GROUP_LS - 
1)); > + > + memset(&acmnlmsg, 0, sizeof(acmnlmsg)); > + acmnlmsg.nlmsg_header.nlmsg_len = NLMSG_HDRLEN; > + acmnlmsg.nlmsg_header.nlmsg_pid = getpid(); > + acmnlmsg.nlmsg_header.nlmsg_type = orig->nlmsg_header.nlmsg_type; > + acmnlmsg.nlmsg_header.nlmsg_flags = NLM_F_REQUEST; > + acmnlmsg.nlmsg_header.nlmsg_seq = orig->nlmsg_header.nlmsg_seq; > + > + if (msg->hdr.status != ACM_STATUS_SUCCESS) { > + acm_log(2, "acm status no success = %d\n", msg->hdr.status); > + acmnlmsg.nlmsg_header.nlmsg_flags |= RDMA_NL_LS_F_ERR; > + acmnlmsg.nlmsg_header.nlmsg_len += > + NLA_ALIGN(sizeof(struct acm_nl_status)); > + acmnlmsg.status[0].attr_hdr.nla_type = LS_NLA_TYPE_STATUS; > + acmnlmsg.status[0].attr_hdr.nla_len = NLA_HDRLEN + > + sizeof(struct rdma_nla_ls_status); > + if (msg->hdr.status == ACM_STATUS_EINVAL) > + acmnlmsg.status[0].status.status = LS_NLA_STATUS_EINVAL; > + else > + acmnlmsg.status[0].status.status = > + LS_NLA_STATUS_ENODATA; > + } else { > + acm_log(2, "acm status success\n"); > + acmnlmsg.nlmsg_header.nlmsg_flags |= RDMA_NL_LS_F_OK; > + acmnlmsg.nlmsg_header.nlmsg_len += > + NLA_ALIGN(sizeof(struct acm_nl_path)); > + acmnlmsg.path[0].attr_hdr.nla_type = LS_NLA_TYPE_PATH_RECORD; > + acmnlmsg.path[0].attr_hdr.nla_len = sizeof(struct acm_nl_path); > + acmnlmsg.path[0].rec.flags = > + IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL; > + memcpy(acmnlmsg.path[0].rec.path_rec, > + &msg->resolve_data[0].info.path, > + sizeof(struct ibv_path_record)); > + } > + > + datalen = NLMSG_ALIGN(acmnlmsg.nlmsg_header.nlmsg_len); > + ret = sendto(sock, &acmnlmsg, datalen, 0, > + (const struct sockaddr *)&dst_addr, > + (socklen_t)sizeof(dst_addr)); > + if (ret != datalen) { > + acm_log(0, "ERROR - sendto = %d errno = %d\n", ret, errno); > + ret = -1; > + } else { > + ret = msg->hdr.length; > + } > + > + free(orig); > + > + return ret; > +} > + > +#define NLA_LEN(nla) ((nla)->nla_len - NLA_HDRLEN) > +#define NLA_DATA(nla) ((char *)(nla) + NLA_HDRLEN) > + > +static void acm_nl_parse_path_attr(struct nlattr *attr, > + struct acm_ep_addr_data *data) Should this return status as to whether parsing was successful or not ? 
> +{ > + struct ibv_path_record *path; > + struct rdma_nla_ls_service_id *sid; > + struct rdma_nla_ls_gid *gid; > + struct rdma_nla_ls_tclass *tcl; > + struct rdma_nla_ls_reversible *rev; > + struct rdma_nla_ls_numb_path *npath; > + struct rdma_nla_ls_pkey *pkey; > + struct rdma_nla_ls_qos_class *qos; > + uint16_t val; > + > +#define IBV_PATH_RECORD_QOS_MASK 0xfff0 > + > + path = &data->info.path; > + switch (attr->nla_type & NLA_TYPE_MASK) { > + case LS_NLA_TYPE_SERVICE_ID: > + sid = (struct rdma_nla_ls_service_id *) NLA_DATA(attr); > + if (NLA_LEN(attr) == sizeof(sid->service_id)) { > + acm_log(2, "service_id 0x%llx\n", sid->service_id); > + path->service_id = sid->service_id; > + } > + break; > + > + case LS_NLA_TYPE_DGID: > + gid = (struct rdma_nla_ls_gid *) NLA_DATA(attr); > + if (NLA_LEN(attr) == sizeof(gid->gid)) { > + acm_format_name(2, log_data, sizeof(log_data), > + ACM_ADDRESS_GID, gid->gid, > + sizeof(union ibv_gid)); > + acm_log(2, "path dgid %s\n", log_data); > + memcpy(path->dgid.raw, gid->gid, sizeof(path->dgid)); > + data->flags |= ACM_EP_FLAG_DEST; > + } > + break; > + > + case LS_NLA_TYPE_SGID: > + gid = (struct rdma_nla_ls_gid *) NLA_DATA(attr); > + if (NLA_LEN(attr) == sizeof(gid->gid)) { > + acm_format_name(2, log_data, sizeof(log_data), > + ACM_ADDRESS_GID, gid->gid, > + sizeof(union ibv_gid)); > + acm_log(2, "path sgid %s\n", log_data); > + memcpy(path->sgid.raw, gid->gid, sizeof(path->sgid)); > + data->flags |= ACM_EP_FLAG_SOURCE; > + } > + break; > + > + case LS_NLA_TYPE_TCLASS: > + tcl = (struct rdma_nla_ls_tclass *) NLA_DATA(attr); > + if (NLA_LEN(attr) == sizeof(tcl->tclass)) { > + acm_log(2, "tclass 0x%x\n", tcl->tclass); > + path->tclass = tcl->tclass; > + } > + break; > + > + case LS_NLA_TYPE_REVERSIBLE: > + rev = (struct rdma_nla_ls_reversible *) NLA_DATA(attr); > + if (NLA_LEN(attr) == sizeof(rev->reversible)) { > + acm_log(2, "reversible 0x%x\n", rev->reversible); > + if (rev->reversible) > + path->reversible_numpath |= > + IBV_PATH_RECORD_REVERSIBLE; > + else > + path->reversible_numpath &= > + ~IBV_PATH_RECORD_REVERSIBLE; > + } > + break; > + > + case LS_NLA_TYPE_NUMB_PATH: > + npath = (struct rdma_nla_ls_numb_path *) NLA_DATA(attr); > + if (NLA_LEN(attr) == sizeof(npath->numb_path)) { > + acm_log(2, "numb_path %d\n", npath->numb_path); > + path->reversible_numpath &= IBV_PATH_RECORD_REVERSIBLE; > + path->reversible_numpath |= > + (npath->numb_path & > + (~IBV_PATH_RECORD_REVERSIBLE)); > + } > + break; > + > + case LS_NLA_TYPE_PKEY: > + pkey = (struct rdma_nla_ls_pkey *) NLA_DATA(attr); > + if (NLA_LEN(attr) == sizeof(pkey->pkey)) { > + acm_log(2, "pkey 0x%x\n", pkey->pkey); > + path->pkey = pkey->pkey; > + } > + break; > + > + case LS_NLA_TYPE_QOS_CLASS: > + qos = (struct rdma_nla_ls_qos_class *) NLA_DATA(attr); > + if (NLA_LEN(attr) == sizeof(qos->qos_class)) { > + acm_log(2, "qos_class 0x%x\n", qos->qos_class); > + val = ntohs(path->qosclass_sl); > + val &= ~IBV_PATH_RECORD_QOS_MASK; > + val |= (ntohs(qos->qos_class) & > + IBV_PATH_RECORD_QOS_MASK); > + path->qosclass_sl = htons(val); > + } > + break; > + > + default: > + acm_log(1, "WARN: unknown attr %x\n", attr->nla_type); > + break; Is this a parsing failure case ? 
> + } > +} > + > +static void acm_nl_process_resolve(struct acmc_client *client, > + struct acm_nl_msg *acmnlmsg) > +{ > + struct acm_msg msg; > + struct nlattr *attr; > + int payload_len; > + int rem; > + int total_attr_len; > + > + memset(&msg, 0, sizeof(msg)); > + msg.hdr.opcode = ACM_OP_RESOLVE; > + msg.hdr.version = ACM_VERSION; > + msg.hdr.length = ACM_MSG_HDR_LENGTH + ACM_MSG_EP_LENGTH; > + msg.hdr.status = ACM_STATUS_SUCCESS; > + msg.hdr.tid = (uint64_t) acmnlmsg; > + msg.resolve_data[0].type = ACM_EP_INFO_PATH; > + > + payload_len = acmnlmsg->nlmsg_header.nlmsg_len - NLMSG_HDRLEN; > + attr = NLMSG_DATA(&acmnlmsg->nlmsg_header); > + rem = payload_len; > + while (1) { > + if (rem < (int) sizeof(*attr) || > + attr->nla_len < sizeof(*attr) || > + attr->nla_len > rem) > + break; > + > + acm_nl_parse_path_attr(attr, &msg.resolve_data[0]); > + > + /* Next attribute */ > + total_attr_len = NLA_ALIGN(attr->nla_len); > + rem -= total_attr_len; > + attr = (struct nlattr *) ((char *) attr + total_attr_len); > + } > + Since ACM does not resolve multicast PRs, as an optimization here, some minor check of DGID could be done and if it's multicast DGID, ENODATA could be indicated in NL message. -- Hal > + atomic_inc(&counter[ACM_CNTR_RESOLVE]); > + acm_svr_resolve(client, &msg); > +} > + > +static void acm_nl_process_invalid_request(struct acmc_client *client, > + struct acm_nl_msg *acmnlmsg) > +{ > + struct acm_msg msg; > + > + memset(&msg, 0, sizeof(msg)); > + msg.hdr.opcode = ACM_OP_RESOLVE; > + msg.hdr.version = ACM_VERSION; > + msg.hdr.length = ACM_MSG_HDR_LENGTH; > + msg.hdr.status = ACM_STATUS_EINVAL; > + msg.hdr.tid = (uint64_t) acmnlmsg; > + > + acm_nl_send(client->sock, &msg); > +} > + > +static int acm_nl_is_valid_resolve_request(struct acm_nl_msg *acmnlmsg) > +{ > + int payload_len; > + > + payload_len = acmnlmsg->nlmsg_header.nlmsg_len - NLMSG_HDRLEN; > + if (payload_len < sizeof(struct nlattr)) > + return 0; > + > + return 1; > +} > + > +static void acm_nl_receive(struct acmc_client *client) > +{ > + struct acm_nl_msg *acmnlmsg; > + int datalen = sizeof(*acmnlmsg); > + int ret; > + uint16_t client_inx, op; > + > + acmnlmsg = calloc(1, sizeof(*acmnlmsg)); > + if (!acmnlmsg) { > + acm_log(0, "Out of memory for recving nl msg.\n"); > + return; > + } > + ret = recv(client->sock, acmnlmsg, datalen, 0); > + if (!NLMSG_OK(&acmnlmsg->nlmsg_header, ret)) { > + acm_log(0, "Netlink receive error: %d.\n", ret); > + goto rcv_cleanup; > + } > + > + acm_log(2, "nlmsg: len %d type 0x%x flags 0x%x seq %d pid %d\n", > + acmnlmsg->nlmsg_header.nlmsg_len, > + acmnlmsg->nlmsg_header.nlmsg_type, > + acmnlmsg->nlmsg_header.nlmsg_flags, > + acmnlmsg->nlmsg_header.nlmsg_seq, > + acmnlmsg->nlmsg_header.nlmsg_pid); > + > + /* Currently we handle only request from the SA client */ > + client_inx = RDMA_NL_GET_CLIENT(acmnlmsg->nlmsg_header.nlmsg_type); > + op = RDMA_NL_GET_OP(acmnlmsg->nlmsg_header.nlmsg_type); > + if (client_inx != RDMA_NL_SA) > + goto rcv_cleanup; > + > + switch (op) { > + case RDMA_NL_LS_OP_RESOLVE: > + if (acm_nl_is_valid_resolve_request(acmnlmsg)) > + acm_nl_process_resolve(client, acmnlmsg); > + else > + acm_nl_process_invalid_request(client, acmnlmsg); > + break; > + default: > + /* Not supported*/ > + acm_log(1, "WARN - invalid opcode %x\n", op); > + acm_nl_process_invalid_request(client, acmnlmsg); > + break; > + } > + > + return; > +rcv_cleanup: > + free(acmnlmsg); > +} > + > +static int acm_init_nl(void) > +{ > + struct sockaddr_nl src_addr; > + int ret; > + SOCKET nl_rcv_socket; > + 
> + nl_rcv_socket = socket(PF_NETLINK, SOCK_RAW, NETLINK_RDMA); > + if (nl_rcv_socket == INVALID_SOCKET) { > + acm_log(0, "ERROR - unable to allocate netlink recv socket\n"); > + return socket_errno(); > + } > + > + memset(&src_addr, 0, sizeof(src_addr)); > + src_addr.nl_family = AF_NETLINK; > + src_addr.nl_pid = getpid(); > + src_addr.nl_groups = (1 << (RDMA_NL_GROUP_LS - 1)); > + > + ret = bind(nl_rcv_socket, (struct sockaddr *)&src_addr, > + sizeof(src_addr)); > + if (ret == SOCKET_ERROR) { > + acm_log(0, "ERROR - unable to bind netlink socket\n"); > + return socket_errno(); > + } > + > + /* init nl client structure */ > + client_array[NL_CLIENT_INDEX].sock = nl_rcv_socket; > + return 0; > +} > + > static void acm_server(void) > { > fd_set readfds; > @@ -1360,12 +1709,14 @@ static void acm_server(void) > acm_log(0, "ERROR - server listen failed\n"); > return; > } > + ret = acm_init_nl(); > + if (ret) > + acm_log(1, "Warn - Netlink init failed\n"); > > while (1) { > n = (int) listen_socket; > FD_ZERO(&readfds); > FD_SET(listen_socket, &readfds); > - > n = max(n, (int) ip_mon_socket); > FD_SET(ip_mon_socket, &readfds); > > @@ -1399,7 +1750,10 @@ static void acm_server(void) > if (client_array[i].sock != INVALID_SOCKET && > FD_ISSET(client_array[i].sock, &readfds)) { > acm_log(2, "receiving from client %d\n", i); > - acm_svr_receive(&client_array[i]); > + if (i == NL_CLIENT_INDEX) > + acm_nl_receive(&client_array[i]); > + else > + acm_svr_receive(&client_array[i]); > } > } > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
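A side note on the two review questions above: a status-returning parser is straightforward to sketch. The following is illustrative only — it is not part of the posted v4 patch (per the reply below, the error handling is deferred to the next revision). It reuses the patch's NLA_LEN/NLA_DATA macros and types and shows a single attribute case; the remaining cases would follow the same pattern, and acm_nl_process_resolve() could then abandon the request with ACM_STATUS_EINVAL on a negative return instead of resolving a partially filled path record.

```c
/* Sketch only: report malformed or unknown attributes to the caller.
 * Assumes the macros and types defined in the patch (NLA_LEN, NLA_DATA,
 * struct acm_ep_addr_data, the LS_NLA_TYPE_* constants); only the
 * SERVICE_ID case is shown here.
 */
static int acm_nl_parse_path_attr(struct nlattr *attr,
				  struct acm_ep_addr_data *data)
{
	struct ibv_path_record *path = &data->info.path;
	struct rdma_nla_ls_service_id *sid;

	switch (attr->nla_type & NLA_TYPE_MASK) {
	case LS_NLA_TYPE_SERVICE_ID:
		sid = (struct rdma_nla_ls_service_id *) NLA_DATA(attr);
		if (NLA_LEN(attr) != sizeof(sid->service_id))
			return -1;		/* truncated attribute */
		path->service_id = sid->service_id;
		break;

	/* ... LS_NLA_TYPE_DGID through LS_NLA_TYPE_QOS_CLASS handled as in
	 * the patch, each returning -1 when its length check fails ...
	 */

	default:
		acm_log(1, "WARN: unknown attr %x\n", attr->nla_type);
		return -1;			/* treat as a parse failure */
	}

	return 0;
}
```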
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > owner@vger.kernel.org] On Behalf Of Hal Rosenstock > Sent: Thursday, June 11, 2015 10:15 AM > > On 6/9/2015 10:57 AM, kaike.wan@intel.com wrote: > > From: Kaike Wan <kaike.wan@intel.com> > > > > This patch enables ibacm to process pathrecord queries through netlink. > > Since ibacm can cache pathrecords, this implementation provides an > > easy pathrecord cache for kernel components and therefore offers great > > performance advantage on large fabric systems. > > > > Signed-off-by: Kaike Wan <kaike.wan@intel.com> > > Signed-off-by: John Fleck <john.fleck@intel.com> > > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > Reviewed-by: Sean Hefty <sean.hefty@intel.com> > > --- > > +#define NLA_LEN(nla) ((nla)->nla_len - NLA_HDRLEN) > > +#define NLA_DATA(nla) ((char *)(nla) + NLA_HDRLEN) > > + > > +static void acm_nl_parse_path_attr(struct nlattr *attr, > > + struct acm_ep_addr_data *data) > > Should this return status as to whether parsing was successful or not ? > > > +{ > > + struct ibv_path_record *path; > > + struct rdma_nla_ls_service_id *sid; > > + struct rdma_nla_ls_gid *gid; > > + struct rdma_nla_ls_tclass *tcl; > > + struct rdma_nla_ls_reversible *rev; > > + struct rdma_nla_ls_numb_path *npath; > > + struct rdma_nla_ls_pkey *pkey; > > + struct rdma_nla_ls_qos_class *qos; > > + uint16_t val; > > + > > +#define IBV_PATH_RECORD_QOS_MASK 0xfff0 > > + > > + path = &data->info.path; > > + switch (attr->nla_type & NLA_TYPE_MASK) { > > + case LS_NLA_TYPE_SERVICE_ID: > > + sid = (struct rdma_nla_ls_service_id *) NLA_DATA(attr); > > + if (NLA_LEN(attr) == sizeof(sid->service_id)) { > > + acm_log(2, "service_id 0x%llx\n", sid->service_id); > > + path->service_id = sid->service_id; > > + } > > + break; > > + > > + case LS_NLA_TYPE_DGID: > > + gid = (struct rdma_nla_ls_gid *) NLA_DATA(attr); > > + if (NLA_LEN(attr) == sizeof(gid->gid)) { > > + acm_format_name(2, log_data, sizeof(log_data), > > + ACM_ADDRESS_GID, gid->gid, > > + sizeof(union ibv_gid)); > > + acm_log(2, "path dgid %s\n", log_data); > > + memcpy(path->dgid.raw, gid->gid, sizeof(path- > >dgid)); > > + data->flags |= ACM_EP_FLAG_DEST; > > + } > > + break; > > + > > + case LS_NLA_TYPE_SGID: > > + gid = (struct rdma_nla_ls_gid *) NLA_DATA(attr); > > + if (NLA_LEN(attr) == sizeof(gid->gid)) { > > + acm_format_name(2, log_data, sizeof(log_data), > > + ACM_ADDRESS_GID, gid->gid, > > + sizeof(union ibv_gid)); > > + acm_log(2, "path sgid %s\n", log_data); > > + memcpy(path->sgid.raw, gid->gid, sizeof(path->sgid)); > > + data->flags |= ACM_EP_FLAG_SOURCE; > > + } > > + break; > > + > > + case LS_NLA_TYPE_TCLASS: > > + tcl = (struct rdma_nla_ls_tclass *) NLA_DATA(attr); > > + if (NLA_LEN(attr) == sizeof(tcl->tclass)) { > > + acm_log(2, "tclass 0x%x\n", tcl->tclass); > > + path->tclass = tcl->tclass; > > + } > > + break; > > + > > + case LS_NLA_TYPE_REVERSIBLE: > > + rev = (struct rdma_nla_ls_reversible *) NLA_DATA(attr); > > + if (NLA_LEN(attr) == sizeof(rev->reversible)) { > > + acm_log(2, "reversible 0x%x\n", rev->reversible); > > + if (rev->reversible) > > + path->reversible_numpath |= > > + IBV_PATH_RECORD_REVERSIBLE; > > + else > > + path->reversible_numpath &= > > + ~IBV_PATH_RECORD_REVERSIBLE; > > + } > > + break; > > + > > + case LS_NLA_TYPE_NUMB_PATH: > > + npath = (struct rdma_nla_ls_numb_path *) NLA_DATA(attr); > > + if (NLA_LEN(attr) == sizeof(npath->numb_path)) { > > + acm_log(2, "numb_path %d\n", npath->numb_path); > > + path->reversible_numpath &= > 
IBV_PATH_RECORD_REVERSIBLE; > > + path->reversible_numpath |= > > + (npath->numb_path & > > + (~IBV_PATH_RECORD_REVERSIBLE)); > > + } > > + break; > > + > > + case LS_NLA_TYPE_PKEY: > > + pkey = (struct rdma_nla_ls_pkey *) NLA_DATA(attr); > > + if (NLA_LEN(attr) == sizeof(pkey->pkey)) { > > + acm_log(2, "pkey 0x%x\n", pkey->pkey); > > + path->pkey = pkey->pkey; > > + } > > + break; > > + > > + case LS_NLA_TYPE_QOS_CLASS: > > + qos = (struct rdma_nla_ls_qos_class *) NLA_DATA(attr); > > + if (NLA_LEN(attr) == sizeof(qos->qos_class)) { > > + acm_log(2, "qos_class 0x%x\n", qos->qos_class); > > + val = ntohs(path->qosclass_sl); > > + val &= ~IBV_PATH_RECORD_QOS_MASK; > > + val |= (ntohs(qos->qos_class) & > > + IBV_PATH_RECORD_QOS_MASK); > > + path->qosclass_sl = htons(val); > > + } > > + break; > > + > > + default: > > + acm_log(1, "WARN: unknown attr %x\n", attr->nla_type); > > + break; > > Is this a parsing failure case ? The error handling code, which was also suggested by Jason, will be added in next revision. > > > + } > > +} > > + > > +static void acm_nl_process_resolve(struct acmc_client *client, > > + struct acm_nl_msg *acmnlmsg) > > +{ > > + struct acm_msg msg; > > + struct nlattr *attr; > > + int payload_len; > > + int rem; > > + int total_attr_len; > > + > > + memset(&msg, 0, sizeof(msg)); > > + msg.hdr.opcode = ACM_OP_RESOLVE; > > + msg.hdr.version = ACM_VERSION; > > + msg.hdr.length = ACM_MSG_HDR_LENGTH + > ACM_MSG_EP_LENGTH; > > + msg.hdr.status = ACM_STATUS_SUCCESS; > > + msg.hdr.tid = (uint64_t) acmnlmsg; > > + msg.resolve_data[0].type = ACM_EP_INFO_PATH; > > + > > + payload_len = acmnlmsg->nlmsg_header.nlmsg_len - > NLMSG_HDRLEN; > > + attr = NLMSG_DATA(&acmnlmsg->nlmsg_header); > > + rem = payload_len; > > + while (1) { > > + if (rem < (int) sizeof(*attr) || > > + attr->nla_len < sizeof(*attr) || > > + attr->nla_len > rem) > > + break; > > + > > + acm_nl_parse_path_attr(attr, &msg.resolve_data[0]); > > + > > + /* Next attribute */ > > + total_attr_len = NLA_ALIGN(attr->nla_len); > > + rem -= total_attr_len; > > + attr = (struct nlattr *) ((char *) attr + total_attr_len); > > + } > > + > > Since ACM does not resolve multicast PRs, Why not? The multicast gid will be used as the dgid and ibacm will not ask peers for address resolution. Instead, It will ask SA directly for the multicast pathrecord (route resolution only). I can't see why it can't be done here. > as an optimization here, some > minor check of DGID could be done and if it's multicast DGID, ENODATA > could be indicated in NL message. > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
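The multicast short-circuit Hal suggests would also be small. A sketch, assuming the DGID attribute has already been parsed into resolve_data by the code above (an IB multicast GID is identified by its 0xff prefix byte); whether such a check belongs in the netlink front end or in each provider is exactly what the rest of the thread discusses:

```c
/* Sketch of the optimization discussed above (not in the posted patch):
 * detect a multicast destination GID before resolving, so the core could
 * answer immediately with LS_NLA_STATUS_ENODATA.  Relies on the
 * acm_ep_addr_data fields that acm_nl_parse_path_attr() fills in.
 */
static int acm_nl_dest_is_multicast(const struct acm_ep_addr_data *data)
{
	return (data->flags & ACM_EP_FLAG_DEST) &&
	       data->info.path.dgid.raw[0] == 0xff;
}
```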
On 6/11/2015 10:25 AM, Wan, Kaike wrote: >>> +static void acm_nl_process_resolve(struct acmc_client *client, >>> + struct acm_nl_msg *acmnlmsg) >>> +{ >>> + struct acm_msg msg; >>> + struct nlattr *attr; >>> + int payload_len; >>> + int rem; >>> + int total_attr_len; >>> + >>> + memset(&msg, 0, sizeof(msg)); >>> + msg.hdr.opcode = ACM_OP_RESOLVE; >>> + msg.hdr.version = ACM_VERSION; >>> + msg.hdr.length = ACM_MSG_HDR_LENGTH + >> ACM_MSG_EP_LENGTH; >>> + msg.hdr.status = ACM_STATUS_SUCCESS; >>> + msg.hdr.tid = (uint64_t) acmnlmsg; >>> + msg.resolve_data[0].type = ACM_EP_INFO_PATH; >>> + >>> + payload_len = acmnlmsg->nlmsg_header.nlmsg_len - >> NLMSG_HDRLEN; >>> + attr = NLMSG_DATA(&acmnlmsg->nlmsg_header); >>> + rem = payload_len; >>> + while (1) { >>> + if (rem < (int) sizeof(*attr) || >>> + attr->nla_len < sizeof(*attr) || >>> + attr->nla_len > rem) >>> + break; >>> + >>> + acm_nl_parse_path_attr(attr, &msg.resolve_data[0]); >>> + >>> + /* Next attribute */ >>> + total_attr_len = NLA_ALIGN(attr->nla_len); >>> + rem -= total_attr_len; >>> + attr = (struct nlattr *) ((char *) attr + total_attr_len); >>> + } >>> + >> >> Since ACM does not resolve multicast PRs, > > Why not? The multicast gid will be used as the dgid and ibacm will not ask peers for address resolution. > Instead, It will ask SA directly for the multicast pathrecord (route resolution only). > I can't see why it can't be done here. Are you saying the local cache lookup fails so it falls back to ask SA for multicast PR ? If so, then perhaps there's provider optimization to fail that style of lookup rather than doing the tree lookup. >> as an optimization here, some >> minor check of DGID could be done and if it's multicast DGID, ENODATA >> could be indicated in NL message. >> > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> From: Hal Rosenstock [mailto:hal@dev.mellanox.co.il] > Sent: Thursday, June 11, 2015 11:53 AM > To: Wan, Kaike > Cc: linux-rdma@vger.kernel.org; Fleck, John; Weiny, Ira > Subject: Re: [PATCH v4 1/1] ibacm: Add support for pathrecord query > through netlink > > On 6/11/2015 10:25 AM, Wan, Kaike wrote: > > >>> +static void acm_nl_process_resolve(struct acmc_client *client, > >>> + struct acm_nl_msg *acmnlmsg) { > >>> + struct acm_msg msg; > >>> + struct nlattr *attr; > >>> + int payload_len; > >>> + int rem; > >>> + int total_attr_len; > >>> + > >>> + memset(&msg, 0, sizeof(msg)); > >>> + msg.hdr.opcode = ACM_OP_RESOLVE; > >>> + msg.hdr.version = ACM_VERSION; > >>> + msg.hdr.length = ACM_MSG_HDR_LENGTH + > >> ACM_MSG_EP_LENGTH; > >>> + msg.hdr.status = ACM_STATUS_SUCCESS; > >>> + msg.hdr.tid = (uint64_t) acmnlmsg; > >>> + msg.resolve_data[0].type = ACM_EP_INFO_PATH; > >>> + > >>> + payload_len = acmnlmsg->nlmsg_header.nlmsg_len - > >> NLMSG_HDRLEN; > >>> + attr = NLMSG_DATA(&acmnlmsg->nlmsg_header); > >>> + rem = payload_len; > >>> + while (1) { > >>> + if (rem < (int) sizeof(*attr) || > >>> + attr->nla_len < sizeof(*attr) || > >>> + attr->nla_len > rem) > >>> + break; > >>> + > >>> + acm_nl_parse_path_attr(attr, &msg.resolve_data[0]); > >>> + > >>> + /* Next attribute */ > >>> + total_attr_len = NLA_ALIGN(attr->nla_len); > >>> + rem -= total_attr_len; > >>> + attr = (struct nlattr *) ((char *) attr + total_attr_len); > >>> + } > >>> + > >> > >> Since ACM does not resolve multicast PRs, > > > > Why not? The multicast gid will be used as the dgid and ibacm will not ask > peers for address resolution. > > Instead, It will ask SA directly for the multicast pathrecord (route resolution > only). > > I can't see why it can't be done here. > > Are you saying the local cache lookup fails so it falls back to ask SA for > multicast PR ? Yes, that's the current implementation for acmp (the default multicast provider). For other providers, it may simply fail the request. Overall, from the ibacm core's point of view (where the netlink API is implemented), there is no reason to reject the request for a multicast PR and the decision should be passed to the individual provider. Kaike > If so, then perhaps there's provider optimization to fail that > style of lookup rather than doing the tree lookup. > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
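Wherever the multicast decision ends up, the way a failure is reported back is already fixed by the patch: the error branch of acm_nl_send() echoes the request's type and sequence number and attaches an LS_NLA_TYPE_STATUS attribute. Condensed into a helper for reference — the helper name is illustrative, the field values are taken from the patch, and the struct definitions are assumed from the patch's headers:

```c
/* Condensed from the error branch of acm_nl_send() in the patch: the reply
 * a requester receives when resolution fails with no data (for example, a
 * provider declining a multicast path record).
 */
static void acm_nl_fill_enodata(struct acm_nl_msg *reply,
				const struct nlmsghdr *req)
{
	memset(reply, 0, sizeof(*reply));
	reply->nlmsg_header.nlmsg_len = NLMSG_HDRLEN +
					NLA_ALIGN(sizeof(struct acm_nl_status));
	reply->nlmsg_header.nlmsg_type = req->nlmsg_type;	/* echo request type */
	reply->nlmsg_header.nlmsg_seq = req->nlmsg_seq;		/* echo sequence number */
	reply->nlmsg_header.nlmsg_pid = getpid();
	reply->nlmsg_header.nlmsg_flags = NLM_F_REQUEST | RDMA_NL_LS_F_ERR;

	reply->status[0].attr_hdr.nla_type = LS_NLA_TYPE_STATUS;
	reply->status[0].attr_hdr.nla_len = NLA_HDRLEN +
					    sizeof(struct rdma_nla_ls_status);
	reply->status[0].status.status = LS_NLA_STATUS_ENODATA;
}
```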
diff --git a/src/acm.c b/src/acm.c index 7649725..d180bd2 100644 --- a/src/acm.c +++ b/src/acm.c @@ -46,6 +46,8 @@ #include <infiniband/acm_prov.h> #include <infiniband/umad.h> #include <infiniband/verbs.h> +#include <infiniband/umad_types.h> +#include <infiniband/umad_sa.h> #include <dlist.h> #include <dlfcn.h> #include <search.h> @@ -55,6 +57,8 @@ #include <netinet/in.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> +#include <rdma/rdma_netlink.h> +#include <rdma/ib_user_sa.h> #include <poll.h> #include "acm_mad.h" #include "acm_util.h" @@ -66,6 +70,7 @@ #define MAX_EP_ADDR 4 #define NL_MSG_BUF_SIZE 4096 #define ACM_PROV_NAME_SIZE 64 +#define NL_CLIENT_INDEX 0 struct acmc_subnet { DLIST_ENTRY entry; @@ -151,6 +156,26 @@ struct acmc_sa_req { struct acm_sa_mad mad; }; +struct acm_nl_status { + struct nlattr attr_hdr; + struct rdma_nla_ls_status status; +}; + +struct acm_nl_path { + struct nlattr attr_hdr; + struct ib_path_rec_data rec; +}; + +struct acm_nl_msg { + struct nlmsghdr nlmsg_header; + union { + uint8_t data[ACM_MSG_DATA_LENGTH]; + struct nlattr attr[0]; + struct acm_nl_status status[0]; + struct acm_nl_path path[0]; + }; +}; + static char def_prov_name[ACM_PROV_NAME_SIZE] = "ibacmp"; static DLIST_ENTRY provider_list; static struct acmc_prov *def_provider = NULL; @@ -172,6 +197,7 @@ static struct acmc_ep *acm_find_ep(struct acmc_port *port, uint16_t pkey); static int acm_ep_insert_addr(struct acmc_ep *ep, const char *name, uint8_t *addr, size_t addr_len, uint8_t addr_type); static void acm_event_handler(struct acmc_device *dev); +static int acm_nl_send(SOCKET sock, struct acm_msg *msg); static struct sa_data { int timeout; @@ -466,7 +492,11 @@ int acm_resolve_response(uint64_t id, struct acm_msg *msg) goto release; } - ret = send(client->sock, (char *) msg, msg->hdr.length, 0); + if (id == NL_CLIENT_INDEX) + ret = acm_nl_send(client->sock, msg); + else + ret = send(client->sock, (char *) msg, msg->hdr.length, 0); + if (ret != msg->hdr.length) acm_log(0, "ERROR - failed to send response\n"); else @@ -597,6 +627,8 @@ static void acm_svr_accept(void) } for (i = 0; i < FD_SETSIZE - 1; i++) { + if (i == NL_CLIENT_INDEX) + continue; if (!atomic_get(&client_array[i].refcnt)) break; } @@ -1346,6 +1378,323 @@ static void acm_ipnl_handler(void) } } +static int acm_nl_send(SOCKET sock, struct acm_msg *msg) +{ + struct sockaddr_nl dst_addr; + struct acm_nl_msg acmnlmsg; + struct acm_nl_msg *orig; + int ret; + int datalen; + + orig = (struct acm_nl_msg *) msg->hdr.tid; + + memset(&dst_addr, 0, sizeof(dst_addr)); + dst_addr.nl_family = AF_NETLINK; + dst_addr.nl_groups = (1 << (RDMA_NL_GROUP_LS - 1)); + + memset(&acmnlmsg, 0, sizeof(acmnlmsg)); + acmnlmsg.nlmsg_header.nlmsg_len = NLMSG_HDRLEN; + acmnlmsg.nlmsg_header.nlmsg_pid = getpid(); + acmnlmsg.nlmsg_header.nlmsg_type = orig->nlmsg_header.nlmsg_type; + acmnlmsg.nlmsg_header.nlmsg_flags = NLM_F_REQUEST; + acmnlmsg.nlmsg_header.nlmsg_seq = orig->nlmsg_header.nlmsg_seq; + + if (msg->hdr.status != ACM_STATUS_SUCCESS) { + acm_log(2, "acm status no success = %d\n", msg->hdr.status); + acmnlmsg.nlmsg_header.nlmsg_flags |= RDMA_NL_LS_F_ERR; + acmnlmsg.nlmsg_header.nlmsg_len += + NLA_ALIGN(sizeof(struct acm_nl_status)); + acmnlmsg.status[0].attr_hdr.nla_type = LS_NLA_TYPE_STATUS; + acmnlmsg.status[0].attr_hdr.nla_len = NLA_HDRLEN + + sizeof(struct rdma_nla_ls_status); + if (msg->hdr.status == ACM_STATUS_EINVAL) + acmnlmsg.status[0].status.status = LS_NLA_STATUS_EINVAL; + else + acmnlmsg.status[0].status.status = + LS_NLA_STATUS_ENODATA; + 
} else { + acm_log(2, "acm status success\n"); + acmnlmsg.nlmsg_header.nlmsg_flags |= RDMA_NL_LS_F_OK; + acmnlmsg.nlmsg_header.nlmsg_len += + NLA_ALIGN(sizeof(struct acm_nl_path)); + acmnlmsg.path[0].attr_hdr.nla_type = LS_NLA_TYPE_PATH_RECORD; + acmnlmsg.path[0].attr_hdr.nla_len = sizeof(struct acm_nl_path); + acmnlmsg.path[0].rec.flags = + IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL; + memcpy(acmnlmsg.path[0].rec.path_rec, + &msg->resolve_data[0].info.path, + sizeof(struct ibv_path_record)); + } + + datalen = NLMSG_ALIGN(acmnlmsg.nlmsg_header.nlmsg_len); + ret = sendto(sock, &acmnlmsg, datalen, 0, + (const struct sockaddr *)&dst_addr, + (socklen_t)sizeof(dst_addr)); + if (ret != datalen) { + acm_log(0, "ERROR - sendto = %d errno = %d\n", ret, errno); + ret = -1; + } else { + ret = msg->hdr.length; + } + + free(orig); + + return ret; +} + +#define NLA_LEN(nla) ((nla)->nla_len - NLA_HDRLEN) +#define NLA_DATA(nla) ((char *)(nla) + NLA_HDRLEN) + +static void acm_nl_parse_path_attr(struct nlattr *attr, + struct acm_ep_addr_data *data) +{ + struct ibv_path_record *path; + struct rdma_nla_ls_service_id *sid; + struct rdma_nla_ls_gid *gid; + struct rdma_nla_ls_tclass *tcl; + struct rdma_nla_ls_reversible *rev; + struct rdma_nla_ls_numb_path *npath; + struct rdma_nla_ls_pkey *pkey; + struct rdma_nla_ls_qos_class *qos; + uint16_t val; + +#define IBV_PATH_RECORD_QOS_MASK 0xfff0 + + path = &data->info.path; + switch (attr->nla_type & NLA_TYPE_MASK) { + case LS_NLA_TYPE_SERVICE_ID: + sid = (struct rdma_nla_ls_service_id *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(sid->service_id)) { + acm_log(2, "service_id 0x%llx\n", sid->service_id); + path->service_id = sid->service_id; + } + break; + + case LS_NLA_TYPE_DGID: + gid = (struct rdma_nla_ls_gid *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(gid->gid)) { + acm_format_name(2, log_data, sizeof(log_data), + ACM_ADDRESS_GID, gid->gid, + sizeof(union ibv_gid)); + acm_log(2, "path dgid %s\n", log_data); + memcpy(path->dgid.raw, gid->gid, sizeof(path->dgid)); + data->flags |= ACM_EP_FLAG_DEST; + } + break; + + case LS_NLA_TYPE_SGID: + gid = (struct rdma_nla_ls_gid *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(gid->gid)) { + acm_format_name(2, log_data, sizeof(log_data), + ACM_ADDRESS_GID, gid->gid, + sizeof(union ibv_gid)); + acm_log(2, "path sgid %s\n", log_data); + memcpy(path->sgid.raw, gid->gid, sizeof(path->sgid)); + data->flags |= ACM_EP_FLAG_SOURCE; + } + break; + + case LS_NLA_TYPE_TCLASS: + tcl = (struct rdma_nla_ls_tclass *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(tcl->tclass)) { + acm_log(2, "tclass 0x%x\n", tcl->tclass); + path->tclass = tcl->tclass; + } + break; + + case LS_NLA_TYPE_REVERSIBLE: + rev = (struct rdma_nla_ls_reversible *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(rev->reversible)) { + acm_log(2, "reversible 0x%x\n", rev->reversible); + if (rev->reversible) + path->reversible_numpath |= + IBV_PATH_RECORD_REVERSIBLE; + else + path->reversible_numpath &= + ~IBV_PATH_RECORD_REVERSIBLE; + } + break; + + case LS_NLA_TYPE_NUMB_PATH: + npath = (struct rdma_nla_ls_numb_path *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(npath->numb_path)) { + acm_log(2, "numb_path %d\n", npath->numb_path); + path->reversible_numpath &= IBV_PATH_RECORD_REVERSIBLE; + path->reversible_numpath |= + (npath->numb_path & + (~IBV_PATH_RECORD_REVERSIBLE)); + } + break; + + case LS_NLA_TYPE_PKEY: + pkey = (struct rdma_nla_ls_pkey *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(pkey->pkey)) { + acm_log(2, "pkey 0x%x\n", pkey->pkey); + 
path->pkey = pkey->pkey; + } + break; + + case LS_NLA_TYPE_QOS_CLASS: + qos = (struct rdma_nla_ls_qos_class *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(qos->qos_class)) { + acm_log(2, "qos_class 0x%x\n", qos->qos_class); + val = ntohs(path->qosclass_sl); + val &= ~IBV_PATH_RECORD_QOS_MASK; + val |= (ntohs(qos->qos_class) & + IBV_PATH_RECORD_QOS_MASK); + path->qosclass_sl = htons(val); + } + break; + + default: + acm_log(1, "WARN: unknown attr %x\n", attr->nla_type); + break; + } +} + +static void acm_nl_process_resolve(struct acmc_client *client, + struct acm_nl_msg *acmnlmsg) +{ + struct acm_msg msg; + struct nlattr *attr; + int payload_len; + int rem; + int total_attr_len; + + memset(&msg, 0, sizeof(msg)); + msg.hdr.opcode = ACM_OP_RESOLVE; + msg.hdr.version = ACM_VERSION; + msg.hdr.length = ACM_MSG_HDR_LENGTH + ACM_MSG_EP_LENGTH; + msg.hdr.status = ACM_STATUS_SUCCESS; + msg.hdr.tid = (uint64_t) acmnlmsg; + msg.resolve_data[0].type = ACM_EP_INFO_PATH; + + payload_len = acmnlmsg->nlmsg_header.nlmsg_len - NLMSG_HDRLEN; + attr = NLMSG_DATA(&acmnlmsg->nlmsg_header); + rem = payload_len; + while (1) { + if (rem < (int) sizeof(*attr) || + attr->nla_len < sizeof(*attr) || + attr->nla_len > rem) + break; + + acm_nl_parse_path_attr(attr, &msg.resolve_data[0]); + + /* Next attribute */ + total_attr_len = NLA_ALIGN(attr->nla_len); + rem -= total_attr_len; + attr = (struct nlattr *) ((char *) attr + total_attr_len); + } + + atomic_inc(&counter[ACM_CNTR_RESOLVE]); + acm_svr_resolve(client, &msg); +} + +static void acm_nl_process_invalid_request(struct acmc_client *client, + struct acm_nl_msg *acmnlmsg) +{ + struct acm_msg msg; + + memset(&msg, 0, sizeof(msg)); + msg.hdr.opcode = ACM_OP_RESOLVE; + msg.hdr.version = ACM_VERSION; + msg.hdr.length = ACM_MSG_HDR_LENGTH; + msg.hdr.status = ACM_STATUS_EINVAL; + msg.hdr.tid = (uint64_t) acmnlmsg; + + acm_nl_send(client->sock, &msg); +} + +static int acm_nl_is_valid_resolve_request(struct acm_nl_msg *acmnlmsg) +{ + int payload_len; + + payload_len = acmnlmsg->nlmsg_header.nlmsg_len - NLMSG_HDRLEN; + if (payload_len < sizeof(struct nlattr)) + return 0; + + return 1; +} + +static void acm_nl_receive(struct acmc_client *client) +{ + struct acm_nl_msg *acmnlmsg; + int datalen = sizeof(*acmnlmsg); + int ret; + uint16_t client_inx, op; + + acmnlmsg = calloc(1, sizeof(*acmnlmsg)); + if (!acmnlmsg) { + acm_log(0, "Out of memory for recving nl msg.\n"); + return; + } + ret = recv(client->sock, acmnlmsg, datalen, 0); + if (!NLMSG_OK(&acmnlmsg->nlmsg_header, ret)) { + acm_log(0, "Netlink receive error: %d.\n", ret); + goto rcv_cleanup; + } + + acm_log(2, "nlmsg: len %d type 0x%x flags 0x%x seq %d pid %d\n", + acmnlmsg->nlmsg_header.nlmsg_len, + acmnlmsg->nlmsg_header.nlmsg_type, + acmnlmsg->nlmsg_header.nlmsg_flags, + acmnlmsg->nlmsg_header.nlmsg_seq, + acmnlmsg->nlmsg_header.nlmsg_pid); + + /* Currently we handle only request from the SA client */ + client_inx = RDMA_NL_GET_CLIENT(acmnlmsg->nlmsg_header.nlmsg_type); + op = RDMA_NL_GET_OP(acmnlmsg->nlmsg_header.nlmsg_type); + if (client_inx != RDMA_NL_SA) + goto rcv_cleanup; + + switch (op) { + case RDMA_NL_LS_OP_RESOLVE: + if (acm_nl_is_valid_resolve_request(acmnlmsg)) + acm_nl_process_resolve(client, acmnlmsg); + else + acm_nl_process_invalid_request(client, acmnlmsg); + break; + default: + /* Not supported*/ + acm_log(1, "WARN - invalid opcode %x\n", op); + acm_nl_process_invalid_request(client, acmnlmsg); + break; + } + + return; +rcv_cleanup: + free(acmnlmsg); +} + +static int acm_init_nl(void) +{ + struct 
sockaddr_nl src_addr; + int ret; + SOCKET nl_rcv_socket; + + nl_rcv_socket = socket(PF_NETLINK, SOCK_RAW, NETLINK_RDMA); + if (nl_rcv_socket == INVALID_SOCKET) { + acm_log(0, "ERROR - unable to allocate netlink recv socket\n"); + return socket_errno(); + } + + memset(&src_addr, 0, sizeof(src_addr)); + src_addr.nl_family = AF_NETLINK; + src_addr.nl_pid = getpid(); + src_addr.nl_groups = (1 << (RDMA_NL_GROUP_LS - 1)); + + ret = bind(nl_rcv_socket, (struct sockaddr *)&src_addr, + sizeof(src_addr)); + if (ret == SOCKET_ERROR) { + acm_log(0, "ERROR - unable to bind netlink socket\n"); + return socket_errno(); + } + + /* init nl client structure */ + client_array[NL_CLIENT_INDEX].sock = nl_rcv_socket; + return 0; +} + static void acm_server(void) { fd_set readfds; @@ -1360,12 +1709,14 @@ static void acm_server(void) acm_log(0, "ERROR - server listen failed\n"); return; } + ret = acm_init_nl(); + if (ret) + acm_log(1, "Warn - Netlink init failed\n"); while (1) { n = (int) listen_socket; FD_ZERO(&readfds); FD_SET(listen_socket, &readfds); - n = max(n, (int) ip_mon_socket); FD_SET(ip_mon_socket, &readfds); @@ -1399,7 +1750,10 @@ static void acm_server(void) if (client_array[i].sock != INVALID_SOCKET && FD_ISSET(client_array[i].sock, &readfds)) { acm_log(2, "receiving from client %d\n", i); - acm_svr_receive(&client_array[i]); + if (i == NL_CLIENT_INDEX) + acm_nl_receive(&client_array[i]); + else + acm_svr_receive(&client_array[i]); } }
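For context on the wire format this server code consumes, below is a minimal, illustrative sender that builds the kind of RDMA_NL_LS_OP_RESOLVE request acm_nl_receive() parses: a netlink header addressed to the RDMA_NL_SA client followed by GID attributes. The struct and constant names come from the kernel headers the patch includes; in the real deployment the kernel SA client, not a userspace program, originates the request, and nl_sock is assumed to be a socket(PF_NETLINK, SOCK_RAW, NETLINK_RDMA) descriptor bound as in acm_init_nl().

```c
/* Illustrative test sender (not part of the patch): emit an
 * RDMA_NL_LS_OP_RESOLVE request carrying SGID and DGID attributes, shaped
 * the way acm_nl_process_resolve() walks them.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <rdma/rdma_netlink.h>
#include <infiniband/verbs.h>

static int send_resolve_request(int nl_sock, const union ibv_gid *sgid,
				const union ibv_gid *dgid)
{
	struct {
		struct nlmsghdr nlh;
		struct {
			struct nlattr hdr;
			uint8_t gid[16];	/* 4-byte nlattr header + 16-byte GID is already NLA-aligned */
		} src, dst;
	} req;
	struct sockaddr_nl addr;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = sizeof(req);
	req.nlh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_SA, RDMA_NL_LS_OP_RESOLVE);
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.nlh.nlmsg_pid = getpid();

	req.src.hdr.nla_type = LS_NLA_TYPE_SGID;
	req.src.hdr.nla_len = sizeof(req.src);
	memcpy(req.src.gid, sgid->raw, sizeof(req.src.gid));

	req.dst.hdr.nla_type = LS_NLA_TYPE_DGID;
	req.dst.hdr.nla_len = sizeof(req.dst);
	memcpy(req.dst.gid, dgid->raw, sizeof(req.dst.gid));

	/* Address the link-services multicast group that acm_init_nl()
	 * subscribes its receive socket to.
	 */
	memset(&addr, 0, sizeof(addr));
	addr.nl_family = AF_NETLINK;
	addr.nl_groups = (1 << (RDMA_NL_GROUP_LS - 1));

	return sendto(nl_sock, &req, sizeof(req), 0,
		      (struct sockaddr *) &addr, sizeof(addr));
}
```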