Message ID | 20200710140612.14749.47766.stgit@klimt.1015granger.net (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | IB CM tracepoints | expand |
> On Jul 10, 2020, at 10:06 AM, Chuck Lever <chuck.lever@oracle.com> wrote: > > In the interest of converging on a common instrumentation > infrastructure, modernize the pr_debug() call sites added by commit > 119bf81793ea ("IB/cm: Add debug prints to ib_cm"). The new > tracepoints appear in a new "ib_cma" subsystem. > > The conversion is somewhat mechanical. Someone more familiar with > the semantics of the recorded information might suggest additional > data capture. > > Some benefits include: > - Tracepoints enable "always on" reporting of these errors > - The error records are structured and compact > - Tracepoints provide hooks for eBPF scripts > > Sample output: > > nfsd-1954 [003] 62.017901: cm_send_dreq_err: local_id=1998890974 remote_id=1129750393 state=DREQ_RCVD lap_state=LAP_UNINIT Oops. I renamed this trace point cm_dreq_skipped and neglected to update the patch description. > Signed-off-by: Chuck Lever <chuck.lever@oracle.com> > --- > drivers/infiniband/core/Makefile | 2 > drivers/infiniband/core/cm.c | 80 +++------ > drivers/infiniband/core/cm_trace.c | 15 ++ > drivers/infiniband/core/cm_trace.h | 309 ++++++++++++++++++++++++++++++++++++ > 4 files changed, 351 insertions(+), 55 deletions(-) > create mode 100644 drivers/infiniband/core/cm_trace.c > create mode 100644 drivers/infiniband/core/cm_trace.h > > diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile > index 24cb71a16a28..ccf2670ef45e 100644 > --- a/drivers/infiniband/core/Makefile > +++ b/drivers/infiniband/core/Makefile > @@ -17,7 +17,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ > ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o > ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o > > -ib_cm-y := cm.o > +ib_cm-y := cm.o cm_trace.o > > iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o > > diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c > index 0d1377232933..8dd8039e1a02 100644 > --- a/drivers/infiniband/core/cm.c > +++ b/drivers/infiniband/core/cm.c > @@ -27,6 +27,7 @@ > #include <rdma/ib_cm.h> > #include "cm_msgs.h" > #include "core_priv.h" > +#include "cm_trace.h" > > MODULE_AUTHOR("Sean Hefty"); > MODULE_DESCRIPTION("InfiniBand CM"); > @@ -2124,8 +2125,7 @@ static int cm_req_handler(struct cm_work *work) > > listen_cm_id_priv = cm_match_req(work, cm_id_priv); > if (!listen_cm_id_priv) { > - pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__, > - be32_to_cpu(cm_id_priv->id.local_id)); > + trace_cm_no_listener_err(&cm_id_priv->id); > cm_id_priv->id.state = IB_CM_IDLE; > ret = -EINVAL; > goto destroy; > @@ -2274,8 +2274,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, > spin_lock_irqsave(&cm_id_priv->lock, flags); > if (cm_id->state != IB_CM_REQ_RCVD && > cm_id->state != IB_CM_MRA_REQ_SENT) { > - pr_debug("%s: local_comm_id %d, cm_id->state: %d\n", __func__, > - be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); > + trace_cm_send_rep_err(cm_id_priv->id.local_id, cm_id->state); > ret = -EINVAL; > goto out; > } > @@ -2348,8 +2347,7 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id, > spin_lock_irqsave(&cm_id_priv->lock, flags); > if (cm_id->state != IB_CM_REP_RCVD && > cm_id->state != IB_CM_MRA_REP_SENT) { > - pr_debug("%s: local_id %d, cm_id->state %d\n", __func__, > - be32_to_cpu(cm_id->local_id), cm_id->state); > + trace_cm_send_cm_rtu_err(cm_id); > ret = -EINVAL; > goto error; > } > @@ -2465,7 +2463,7 @@ static int cm_rep_handler(struct cm_work *work) > cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0); > if (!cm_id_priv) { > cm_dup_rep_handler(work); > - pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__, > + trace_cm_remote_no_priv_err( > IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); > return -EINVAL; > } > @@ -2479,11 +2477,10 @@ static int cm_rep_handler(struct cm_work *work) > break; > default: > ret = -EINVAL; > - pr_debug( > - "%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n", > - __func__, cm_id_priv->id.state, > + trace_cm_rep_unknown_err( > IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), > - IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); > + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg), > + cm_id_priv->id.state); > spin_unlock_irq(&cm_id_priv->lock); > goto error; > } > @@ -2500,7 +2497,7 @@ static int cm_rep_handler(struct cm_work *work) > spin_unlock(&cm.lock); > spin_unlock_irq(&cm_id_priv->lock); > ret = -EINVAL; > - pr_debug("%s: Failed to insert remote id %d\n", __func__, > + trace_cm_insert_failed_err( > IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); > goto error; > } > @@ -2517,9 +2514,8 @@ static int cm_rep_handler(struct cm_work *work) > IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, > NULL, 0); > ret = -EINVAL; > - pr_debug( > - "%s: Stale connection. local_comm_id %d, remote_comm_id %d\n", > - __func__, IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), > + trace_cm_staleconn_err( > + IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), > IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); > > if (cur_cm_id_priv) { > @@ -2646,9 +2642,7 @@ static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, > return -EINVAL; > > if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { > - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, > - be32_to_cpu(cm_id_priv->id.local_id), > - cm_id_priv->id.state); > + trace_cm_dreq_skipped(&cm_id_priv->id); > return -EINVAL; > } > > @@ -2722,10 +2716,7 @@ static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, > return -EINVAL; > > if (cm_id_priv->id.state != IB_CM_DREQ_RCVD) { > - pr_debug( > - "%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n", > - __func__, be32_to_cpu(cm_id_priv->id.local_id), > - cm_id_priv->id.state); > + trace_cm_send_drep_err(&cm_id_priv->id); > kfree(private_data); > return -EINVAL; > } > @@ -2810,9 +2801,8 @@ static int cm_dreq_handler(struct cm_work *work) > atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. > counter[CM_DREQ_COUNTER]); > cm_issue_drep(work->port, work->mad_recv_wc); > - pr_debug( > - "%s: no cm_id_priv, local_comm_id %d, remote_comm_id %d\n", > - __func__, IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), > + trace_cm_no_priv_err( > + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), > IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); > return -EINVAL; > } > @@ -2858,9 +2848,7 @@ static int cm_dreq_handler(struct cm_work *work) > counter[CM_DREQ_COUNTER]); > goto unlock; > default: > - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", > - __func__, be32_to_cpu(cm_id_priv->id.local_id), > - cm_id_priv->id.state); > + trace_cm_dreq_unknown_err(&cm_id_priv->id); > goto unlock; > } > cm_id_priv->id.state = IB_CM_DREQ_RCVD; > @@ -2945,9 +2933,7 @@ static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, > state); > break; > default: > - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, > - be32_to_cpu(cm_id_priv->id.local_id), > - cm_id_priv->id.state); > + trace_cm_send_unknown_rej_err(&cm_id_priv->id); > return -EINVAL; > } > > @@ -3060,9 +3046,7 @@ static int cm_rej_handler(struct cm_work *work) > } > /* fall through */ > default: > - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", > - __func__, be32_to_cpu(cm_id_priv->id.local_id), > - cm_id_priv->id.state); > + trace_cm_rej_unknown_err(&cm_id_priv->id); > spin_unlock_irq(&cm_id_priv->lock); > goto out; > } > @@ -3118,9 +3102,7 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, > } > /* fall through */ > default: > - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", > - __func__, be32_to_cpu(cm_id_priv->id.local_id), > - cm_id_priv->id.state); > + trace_cm_send_mra_unknown_err(&cm_id_priv->id); > ret = -EINVAL; > goto error1; > } > @@ -3229,9 +3211,7 @@ static int cm_mra_handler(struct cm_work *work) > counter[CM_MRA_COUNTER]); > /* fall through */ > default: > - pr_debug("%s local_id %d, cm_id_priv->id.state: %d\n", > - __func__, be32_to_cpu(cm_id_priv->id.local_id), > - cm_id_priv->id.state); > + trace_cm_mra_unknown_err(&cm_id_priv->id); > goto out; > } > > @@ -3765,8 +3745,7 @@ static void cm_process_send_error(struct ib_mad_send_buf *msg, > if (msg != cm_id_priv->msg || state != cm_id_priv->id.state) > goto discard; > > - pr_debug_ratelimited("CM: failed sending MAD in state %d. (%s)\n", > - state, ib_wc_status_msg(wc_status)); > + trace_cm_mad_send_err(state, wc_status); > switch (state) { > case IB_CM_REQ_SENT: > case IB_CM_MRA_REQ_RCVD: > @@ -3889,7 +3868,7 @@ static void cm_work_handler(struct work_struct *_work) > ret = cm_timewait_handler(work); > break; > default: > - pr_debug("cm_event.event: 0x%x\n", work->cm_event.event); > + trace_cm_handler_err(work->cm_event.event); > ret = -EINVAL; > break; > } > @@ -3925,8 +3904,7 @@ static int cm_establish(struct ib_cm_id *cm_id) > ret = -EISCONN; > break; > default: > - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, > - be32_to_cpu(cm_id->local_id), cm_id->state); > + trace_cm_establish_err(cm_id); > ret = -EINVAL; > break; > } > @@ -4123,9 +4101,7 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, > ret = 0; > break; > default: > - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", > - __func__, be32_to_cpu(cm_id_priv->id.local_id), > - cm_id_priv->id.state); > + trace_cm_qp_init_err(&cm_id_priv->id); > ret = -EINVAL; > break; > } > @@ -4173,9 +4149,7 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, > ret = 0; > break; > default: > - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", > - __func__, be32_to_cpu(cm_id_priv->id.local_id), > - cm_id_priv->id.state); > + trace_cm_qp_rtr_err(&cm_id_priv->id); > ret = -EINVAL; > break; > } > @@ -4235,9 +4209,7 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, > ret = 0; > break; > default: > - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", > - __func__, be32_to_cpu(cm_id_priv->id.local_id), > - cm_id_priv->id.state); > + trace_cm_qp_rts_err(&cm_id_priv->id); > ret = -EINVAL; > break; > } > diff --git a/drivers/infiniband/core/cm_trace.c b/drivers/infiniband/core/cm_trace.c > new file mode 100644 > index 000000000000..8f3482f66338 > --- /dev/null > +++ b/drivers/infiniband/core/cm_trace.c > @@ -0,0 +1,15 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Trace points for the IB Connection Manager. > + * > + * Author: Chuck Lever <chuck.lever@oracle.com> > + * > + * Copyright (c) 2020, Oracle and/or its affiliates. > + */ > + > +#include <rdma/rdma_cm.h> > +#include "cma_priv.h" > + > +#define CREATE_TRACE_POINTS > + > +#include "cm_trace.h" > diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h > new file mode 100644 > index 000000000000..84f65f597e34 > --- /dev/null > +++ b/drivers/infiniband/core/cm_trace.h > @@ -0,0 +1,309 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Trace point definitions for the RDMA Connect Manager. > + * > + * Author: Chuck Lever <chuck.lever@oracle.com> > + * > + * Copyright (c) 2020 Oracle and/or its affiliates. > + */ > + > +#undef TRACE_SYSTEM > +#define TRACE_SYSTEM ib_cma > + > +#if !defined(_TRACE_IB_CMA_H) || defined(TRACE_HEADER_MULTI_READ) > + > +#define _TRACE_IB_CMA_H > + > +#include <linux/tracepoint.h> > +#include <rdma/ib_cm.h> > +#include <trace/events/rdma.h> > + > +/* > + * enum ib_cm_state, from include/rdma/ib_cm.h > + */ > +#define IB_CM_STATE_LIST \ > + ib_cm_state(IDLE) \ > + ib_cm_state(LISTEN) \ > + ib_cm_state(REQ_SENT) \ > + ib_cm_state(REQ_RCVD) \ > + ib_cm_state(MRA_REQ_SENT) \ > + ib_cm_state(MRA_REQ_RCVD) \ > + ib_cm_state(REP_SENT) \ > + ib_cm_state(REP_RCVD) \ > + ib_cm_state(MRA_REP_SENT) \ > + ib_cm_state(MRA_REP_RCVD) \ > + ib_cm_state(ESTABLISHED) \ > + ib_cm_state(DREQ_SENT) \ > + ib_cm_state(DREQ_RCVD) \ > + ib_cm_state(TIMEWAIT) \ > + ib_cm_state(SIDR_REQ_SENT) \ > + ib_cm_state_end(SIDR_REQ_RCVD) > + > +#undef ib_cm_state > +#undef ib_cm_state_end > +#define ib_cm_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); > +#define ib_cm_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); > + > +IB_CM_STATE_LIST > + > +#undef ib_cm_state > +#undef ib_cm_state_end > +#define ib_cm_state(x) { IB_CM_##x, #x }, > +#define ib_cm_state_end(x) { IB_CM_##x, #x } > + > +#define show_ib_cm_state(x) \ > + __print_symbolic(x, IB_CM_STATE_LIST) > + > +/* > + * enum ib_cm_lap_state, from include/rdma/ib_cm.h > + */ > +#define IB_CM_LAP_STATE_LIST \ > + ib_cm_lap_state(LAP_UNINIT) \ > + ib_cm_lap_state(LAP_IDLE) \ > + ib_cm_lap_state(LAP_SENT) \ > + ib_cm_lap_state(LAP_RCVD) \ > + ib_cm_lap_state(MRA_LAP_SENT) \ > + ib_cm_lap_state_end(MRA_LAP_RCVD) > + > +#undef ib_cm_lap_state > +#undef ib_cm_lap_state_end > +#define ib_cm_lap_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); > +#define ib_cm_lap_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); > + > +IB_CM_LAP_STATE_LIST > + > +#undef ib_cm_lap_state > +#undef ib_cm_lap_state_end > +#define ib_cm_lap_state(x) { IB_CM_##x, #x }, > +#define ib_cm_lap_state_end(x) { IB_CM_##x, #x } > + > +#define show_ib_cm_lap_state(x) \ > + __print_symbolic(x, IB_CM_LAP_STATE_LIST) > + > + > +DECLARE_EVENT_CLASS(cm_id_class, > + TP_PROTO( > + const struct ib_cm_id *cm_id > + ), > + > + TP_ARGS(cm_id), > + > + TP_STRUCT__entry( > + __field(const void *, cm_id) /* for eBPF scripts */ > + __field(unsigned int, local_id) > + __field(unsigned int, remote_id) > + __field(unsigned long, state) > + __field(unsigned long, lap_state) > + ), > + > + TP_fast_assign( > + __entry->cm_id = cm_id; > + __entry->local_id = be32_to_cpu(cm_id->local_id); > + __entry->remote_id = be32_to_cpu(cm_id->remote_id); > + __entry->state = cm_id->state; > + __entry->lap_state = cm_id->lap_state; > + ), > + > + TP_printk("local_id=%u remote_id=%u state=%s lap_state=%s", > + __entry->local_id, __entry->remote_id, > + show_ib_cm_state(__entry->state), > + show_ib_cm_lap_state(__entry->lap_state) > + ) > +); > + > +#define DEFINE_CM_ERR_EVENT(name) \ > + DEFINE_EVENT(cm_id_class, \ > + cm_##name##_err, \ > + TP_PROTO( \ > + const struct ib_cm_id *cm_id \ > + ), \ > + TP_ARGS(cm_id)) > + > +DEFINE_CM_ERR_EVENT(send_cm_rtu); > +DEFINE_CM_ERR_EVENT(establish); > +DEFINE_CM_ERR_EVENT(no_listener); > +DEFINE_CM_ERR_EVENT(send_drep); > +DEFINE_CM_ERR_EVENT(dreq_unknown); > +DEFINE_CM_ERR_EVENT(send_unknown_rej); > +DEFINE_CM_ERR_EVENT(rej_unknown); > +DEFINE_CM_ERR_EVENT(send_mra_unknown); > +DEFINE_CM_ERR_EVENT(mra_unknown); > +DEFINE_CM_ERR_EVENT(qp_init); > +DEFINE_CM_ERR_EVENT(qp_rtr); > +DEFINE_CM_ERR_EVENT(qp_rts); > + > +DEFINE_EVENT(cm_id_class, \ > + cm_dreq_skipped, \ > + TP_PROTO( \ > + const struct ib_cm_id *cm_id \ > + ), \ > + TP_ARGS(cm_id) \ > +); > + > +DECLARE_EVENT_CLASS(cm_local_class, > + TP_PROTO( > + unsigned int local_id, > + unsigned int remote_id > + ), > + > + TP_ARGS(local_id, remote_id), > + > + TP_STRUCT__entry( > + __field(unsigned int, local_id) > + __field(unsigned int, remote_id) > + ), > + > + TP_fast_assign( > + __entry->local_id = local_id; > + __entry->remote_id = remote_id; > + ), > + > + TP_printk("local_id=%u remote_id=%u", > + __entry->local_id, __entry->remote_id > + ) > +); > + > +#define DEFINE_CM_LOCAL_EVENT(name) \ > + DEFINE_EVENT(cm_local_class, \ > + cm_##name, \ > + TP_PROTO( \ > + unsigned int local_id, \ > + unsigned int remote_id \ > + ), \ > + TP_ARGS(local_id, remote_id)) > + > +DEFINE_CM_LOCAL_EVENT(staleconn_err); > +DEFINE_CM_LOCAL_EVENT(no_priv_err); > + > +DECLARE_EVENT_CLASS(cm_remote_class, > + TP_PROTO( > + u32 remote_id > + ), > + > + TP_ARGS(remote_id), > + > + TP_STRUCT__entry( > + __field(u32, remote_id) > + ), > + > + TP_fast_assign( > + __entry->remote_id = remote_id; > + ), > + > + TP_printk("remote_id=%u", > + __entry->remote_id > + ) > +); > + > +#define DEFINE_CM_REMOTE_EVENT(name) \ > + DEFINE_EVENT(cm_remote_class, \ > + cm_##name, \ > + TP_PROTO( \ > + u32 remote_id \ > + ), \ > + TP_ARGS(remote_id)) > + > +DEFINE_CM_REMOTE_EVENT(remote_no_priv_err); > +DEFINE_CM_REMOTE_EVENT(insert_failed_err); > + > +TRACE_EVENT(cm_send_rep_err, > + TP_PROTO( > + __be32 local_id, > + enum ib_cm_state state > + ), > + > + TP_ARGS(local_id, state), > + > + TP_STRUCT__entry( > + __field(unsigned int, local_id) > + __field(unsigned long, state) > + ), > + > + TP_fast_assign( > + __entry->local_id = be32_to_cpu(local_id); > + __entry->state = state; > + ), > + > + TP_printk("local_id=%u state=%s", > + __entry->local_id, show_ib_cm_state(__entry->state) > + ) > +); > + > +TRACE_EVENT(cm_rep_unknown_err, > + TP_PROTO( > + unsigned int local_id, > + unsigned int remote_id, > + enum ib_cm_state state > + ), > + > + TP_ARGS(local_id, remote_id, state), > + > + TP_STRUCT__entry( > + __field(unsigned int, local_id) > + __field(unsigned int, remote_id) > + __field(unsigned long, state) > + ), > + > + TP_fast_assign( > + __entry->local_id = local_id; > + __entry->remote_id = remote_id; > + __entry->state = state; > + ), > + > + TP_printk("local_id=%u remote_id=%u state=%s", > + __entry->local_id, __entry->remote_id, > + show_ib_cm_state(__entry->state) > + ) > +); > + > +TRACE_EVENT(cm_handler_err, > + TP_PROTO( > + enum ib_event_type event > + ), > + > + TP_ARGS(event), > + > + TP_STRUCT__entry( > + __field(unsigned long, event) > + ), > + > + TP_fast_assign( > + __entry->event = event; > + ), > + > + TP_printk("unhandled event=%s", > + rdma_show_ib_event(__entry->event) > + ) > +); > + > +TRACE_EVENT(cm_mad_send_err, > + TP_PROTO( > + enum ib_cm_state state, > + enum ib_wc_status wc_status > + ), > + > + TP_ARGS(state, wc_status), > + > + TP_STRUCT__entry( > + __field(unsigned long, state) > + __field(unsigned long, wc_status) > + ), > + > + TP_fast_assign( > + __entry->state = state; > + __entry->wc_status = wc_status; > + ), > + > + TP_printk("state=%s completion status=%s", > + show_ib_cm_state(__entry->state), > + rdma_show_wc_status(__entry->wc_status) > + ) > +); > + > +#endif /* _TRACE_IB_CMA_H */ > + > +#undef TRACE_INCLUDE_PATH > +#define TRACE_INCLUDE_PATH . > +#define TRACE_INCLUDE_FILE cm_trace > + > +#include <trace/define_trace.h> > -- Chuck Lever
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 24cb71a16a28..ccf2670ef45e 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -17,7 +17,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o -ib_cm-y := cm.o +ib_cm-y := cm.o cm_trace.o iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 0d1377232933..8dd8039e1a02 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -27,6 +27,7 @@ #include <rdma/ib_cm.h> #include "cm_msgs.h" #include "core_priv.h" +#include "cm_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); @@ -2124,8 +2125,7 @@ static int cm_req_handler(struct cm_work *work) listen_cm_id_priv = cm_match_req(work, cm_id_priv); if (!listen_cm_id_priv) { - pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id)); + trace_cm_no_listener_err(&cm_id_priv->id); cm_id_priv->id.state = IB_CM_IDLE; ret = -EINVAL; goto destroy; @@ -2274,8 +2274,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { - pr_debug("%s: local_comm_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); + trace_cm_send_rep_err(cm_id_priv->id.local_id, cm_id->state); ret = -EINVAL; goto out; } @@ -2348,8 +2347,7 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { - pr_debug("%s: local_id %d, cm_id->state %d\n", __func__, - be32_to_cpu(cm_id->local_id), cm_id->state); + trace_cm_send_cm_rtu_err(cm_id); ret = -EINVAL; goto error; } @@ -2465,7 +2463,7 @@ static int cm_rep_handler(struct cm_work *work) cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0); if (!cm_id_priv) { cm_dup_rep_handler(work); - pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__, + trace_cm_remote_no_priv_err( IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); return -EINVAL; } @@ -2479,11 +2477,10 @@ static int cm_rep_handler(struct cm_work *work) break; default: ret = -EINVAL; - pr_debug( - "%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n", - __func__, cm_id_priv->id.state, + trace_cm_rep_unknown_err( IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), - IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg), + cm_id_priv->id.state); spin_unlock_irq(&cm_id_priv->lock); goto error; } @@ -2500,7 +2497,7 @@ static int cm_rep_handler(struct cm_work *work) spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug("%s: Failed to insert remote id %d\n", __func__, + trace_cm_insert_failed_err( IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); goto error; } @@ -2517,9 +2514,8 @@ static int cm_rep_handler(struct cm_work *work) IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; - pr_debug( - "%s: Stale connection. local_comm_id %d, remote_comm_id %d\n", - __func__, IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), + trace_cm_staleconn_err( + IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); if (cur_cm_id_priv) { @@ -2646,9 +2642,7 @@ static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, return -EINVAL; if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_cm_dreq_skipped(&cm_id_priv->id); return -EINVAL; } @@ -2722,10 +2716,7 @@ static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, return -EINVAL; if (cm_id_priv->id.state != IB_CM_DREQ_RCVD) { - pr_debug( - "%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_cm_send_drep_err(&cm_id_priv->id); kfree(private_data); return -EINVAL; } @@ -2810,9 +2801,8 @@ static int cm_dreq_handler(struct cm_work *work) atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); - pr_debug( - "%s: no cm_id_priv, local_comm_id %d, remote_comm_id %d\n", - __func__, IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), + trace_cm_no_priv_err( + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); return -EINVAL; } @@ -2858,9 +2848,7 @@ static int cm_dreq_handler(struct cm_work *work) counter[CM_DREQ_COUNTER]); goto unlock; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_cm_dreq_unknown_err(&cm_id_priv->id); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; @@ -2945,9 +2933,7 @@ static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, state); break; default: - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_cm_send_unknown_rej_err(&cm_id_priv->id); return -EINVAL; } @@ -3060,9 +3046,7 @@ static int cm_rej_handler(struct cm_work *work) } /* fall through */ default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_cm_rej_unknown_err(&cm_id_priv->id); spin_unlock_irq(&cm_id_priv->lock); goto out; } @@ -3118,9 +3102,7 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, } /* fall through */ default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_cm_send_mra_unknown_err(&cm_id_priv->id); ret = -EINVAL; goto error1; } @@ -3229,9 +3211,7 @@ static int cm_mra_handler(struct cm_work *work) counter[CM_MRA_COUNTER]); /* fall through */ default: - pr_debug("%s local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_cm_mra_unknown_err(&cm_id_priv->id); goto out; } @@ -3765,8 +3745,7 @@ static void cm_process_send_error(struct ib_mad_send_buf *msg, if (msg != cm_id_priv->msg || state != cm_id_priv->id.state) goto discard; - pr_debug_ratelimited("CM: failed sending MAD in state %d. (%s)\n", - state, ib_wc_status_msg(wc_status)); + trace_cm_mad_send_err(state, wc_status); switch (state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: @@ -3889,7 +3868,7 @@ static void cm_work_handler(struct work_struct *_work) ret = cm_timewait_handler(work); break; default: - pr_debug("cm_event.event: 0x%x\n", work->cm_event.event); + trace_cm_handler_err(work->cm_event.event); ret = -EINVAL; break; } @@ -3925,8 +3904,7 @@ static int cm_establish(struct ib_cm_id *cm_id) ret = -EISCONN; break; default: - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id->local_id), cm_id->state); + trace_cm_establish_err(cm_id); ret = -EINVAL; break; } @@ -4123,9 +4101,7 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_cm_qp_init_err(&cm_id_priv->id); ret = -EINVAL; break; } @@ -4173,9 +4149,7 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_cm_qp_rtr_err(&cm_id_priv->id); ret = -EINVAL; break; } @@ -4235,9 +4209,7 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_cm_qp_rts_err(&cm_id_priv->id); ret = -EINVAL; break; } diff --git a/drivers/infiniband/core/cm_trace.c b/drivers/infiniband/core/cm_trace.c new file mode 100644 index 000000000000..8f3482f66338 --- /dev/null +++ b/drivers/infiniband/core/cm_trace.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Trace points for the IB Connection Manager. + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2020, Oracle and/or its affiliates. + */ + +#include <rdma/rdma_cm.h> +#include "cma_priv.h" + +#define CREATE_TRACE_POINTS + +#include "cm_trace.h" diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h new file mode 100644 index 000000000000..84f65f597e34 --- /dev/null +++ b/drivers/infiniband/core/cm_trace.h @@ -0,0 +1,309 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Trace point definitions for the RDMA Connect Manager. + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2020 Oracle and/or its affiliates. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ib_cma + +#if !defined(_TRACE_IB_CMA_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_IB_CMA_H + +#include <linux/tracepoint.h> +#include <rdma/ib_cm.h> +#include <trace/events/rdma.h> + +/* + * enum ib_cm_state, from include/rdma/ib_cm.h + */ +#define IB_CM_STATE_LIST \ + ib_cm_state(IDLE) \ + ib_cm_state(LISTEN) \ + ib_cm_state(REQ_SENT) \ + ib_cm_state(REQ_RCVD) \ + ib_cm_state(MRA_REQ_SENT) \ + ib_cm_state(MRA_REQ_RCVD) \ + ib_cm_state(REP_SENT) \ + ib_cm_state(REP_RCVD) \ + ib_cm_state(MRA_REP_SENT) \ + ib_cm_state(MRA_REP_RCVD) \ + ib_cm_state(ESTABLISHED) \ + ib_cm_state(DREQ_SENT) \ + ib_cm_state(DREQ_RCVD) \ + ib_cm_state(TIMEWAIT) \ + ib_cm_state(SIDR_REQ_SENT) \ + ib_cm_state_end(SIDR_REQ_RCVD) + +#undef ib_cm_state +#undef ib_cm_state_end +#define ib_cm_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_STATE_LIST + +#undef ib_cm_state +#undef ib_cm_state_end +#define ib_cm_state(x) { IB_CM_##x, #x }, +#define ib_cm_state_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_state(x) \ + __print_symbolic(x, IB_CM_STATE_LIST) + +/* + * enum ib_cm_lap_state, from include/rdma/ib_cm.h + */ +#define IB_CM_LAP_STATE_LIST \ + ib_cm_lap_state(LAP_UNINIT) \ + ib_cm_lap_state(LAP_IDLE) \ + ib_cm_lap_state(LAP_SENT) \ + ib_cm_lap_state(LAP_RCVD) \ + ib_cm_lap_state(MRA_LAP_SENT) \ + ib_cm_lap_state_end(MRA_LAP_RCVD) + +#undef ib_cm_lap_state +#undef ib_cm_lap_state_end +#define ib_cm_lap_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_lap_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_LAP_STATE_LIST + +#undef ib_cm_lap_state +#undef ib_cm_lap_state_end +#define ib_cm_lap_state(x) { IB_CM_##x, #x }, +#define ib_cm_lap_state_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_lap_state(x) \ + __print_symbolic(x, IB_CM_LAP_STATE_LIST) + + +DECLARE_EVENT_CLASS(cm_id_class, + TP_PROTO( + const struct ib_cm_id *cm_id + ), + + TP_ARGS(cm_id), + + TP_STRUCT__entry( + __field(const void *, cm_id) /* for eBPF scripts */ + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + __field(unsigned long, state) + __field(unsigned long, lap_state) + ), + + TP_fast_assign( + __entry->cm_id = cm_id; + __entry->local_id = be32_to_cpu(cm_id->local_id); + __entry->remote_id = be32_to_cpu(cm_id->remote_id); + __entry->state = cm_id->state; + __entry->lap_state = cm_id->lap_state; + ), + + TP_printk("local_id=%u remote_id=%u state=%s lap_state=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state), + show_ib_cm_lap_state(__entry->lap_state) + ) +); + +#define DEFINE_CM_ERR_EVENT(name) \ + DEFINE_EVENT(cm_id_class, \ + cm_##name##_err, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id)) + +DEFINE_CM_ERR_EVENT(send_cm_rtu); +DEFINE_CM_ERR_EVENT(establish); +DEFINE_CM_ERR_EVENT(no_listener); +DEFINE_CM_ERR_EVENT(send_drep); +DEFINE_CM_ERR_EVENT(dreq_unknown); +DEFINE_CM_ERR_EVENT(send_unknown_rej); +DEFINE_CM_ERR_EVENT(rej_unknown); +DEFINE_CM_ERR_EVENT(send_mra_unknown); +DEFINE_CM_ERR_EVENT(mra_unknown); +DEFINE_CM_ERR_EVENT(qp_init); +DEFINE_CM_ERR_EVENT(qp_rtr); +DEFINE_CM_ERR_EVENT(qp_rts); + +DEFINE_EVENT(cm_id_class, \ + cm_dreq_skipped, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id) \ +); + +DECLARE_EVENT_CLASS(cm_local_class, + TP_PROTO( + unsigned int local_id, + unsigned int remote_id + ), + + TP_ARGS(local_id, remote_id), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + ), + + TP_fast_assign( + __entry->local_id = local_id; + __entry->remote_id = remote_id; + ), + + TP_printk("local_id=%u remote_id=%u", + __entry->local_id, __entry->remote_id + ) +); + +#define DEFINE_CM_LOCAL_EVENT(name) \ + DEFINE_EVENT(cm_local_class, \ + cm_##name, \ + TP_PROTO( \ + unsigned int local_id, \ + unsigned int remote_id \ + ), \ + TP_ARGS(local_id, remote_id)) + +DEFINE_CM_LOCAL_EVENT(staleconn_err); +DEFINE_CM_LOCAL_EVENT(no_priv_err); + +DECLARE_EVENT_CLASS(cm_remote_class, + TP_PROTO( + u32 remote_id + ), + + TP_ARGS(remote_id), + + TP_STRUCT__entry( + __field(u32, remote_id) + ), + + TP_fast_assign( + __entry->remote_id = remote_id; + ), + + TP_printk("remote_id=%u", + __entry->remote_id + ) +); + +#define DEFINE_CM_REMOTE_EVENT(name) \ + DEFINE_EVENT(cm_remote_class, \ + cm_##name, \ + TP_PROTO( \ + u32 remote_id \ + ), \ + TP_ARGS(remote_id)) + +DEFINE_CM_REMOTE_EVENT(remote_no_priv_err); +DEFINE_CM_REMOTE_EVENT(insert_failed_err); + +TRACE_EVENT(cm_send_rep_err, + TP_PROTO( + __be32 local_id, + enum ib_cm_state state + ), + + TP_ARGS(local_id, state), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned long, state) + ), + + TP_fast_assign( + __entry->local_id = be32_to_cpu(local_id); + __entry->state = state; + ), + + TP_printk("local_id=%u state=%s", + __entry->local_id, show_ib_cm_state(__entry->state) + ) +); + +TRACE_EVENT(cm_rep_unknown_err, + TP_PROTO( + unsigned int local_id, + unsigned int remote_id, + enum ib_cm_state state + ), + + TP_ARGS(local_id, remote_id, state), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + __field(unsigned long, state) + ), + + TP_fast_assign( + __entry->local_id = local_id; + __entry->remote_id = remote_id; + __entry->state = state; + ), + + TP_printk("local_id=%u remote_id=%u state=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state) + ) +); + +TRACE_EVENT(cm_handler_err, + TP_PROTO( + enum ib_event_type event + ), + + TP_ARGS(event), + + TP_STRUCT__entry( + __field(unsigned long, event) + ), + + TP_fast_assign( + __entry->event = event; + ), + + TP_printk("unhandled event=%s", + rdma_show_ib_event(__entry->event) + ) +); + +TRACE_EVENT(cm_mad_send_err, + TP_PROTO( + enum ib_cm_state state, + enum ib_wc_status wc_status + ), + + TP_ARGS(state, wc_status), + + TP_STRUCT__entry( + __field(unsigned long, state) + __field(unsigned long, wc_status) + ), + + TP_fast_assign( + __entry->state = state; + __entry->wc_status = wc_status; + ), + + TP_printk("state=%s completion status=%s", + show_ib_cm_state(__entry->state), + rdma_show_wc_status(__entry->wc_status) + ) +); + +#endif /* _TRACE_IB_CMA_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE cm_trace + +#include <trace/define_trace.h>
In the interest of converging on a common instrumentation infrastructure, modernize the pr_debug() call sites added by commit 119bf81793ea ("IB/cm: Add debug prints to ib_cm"). The new tracepoints appear in a new "ib_cma" subsystem. The conversion is somewhat mechanical. Someone more familiar with the semantics of the recorded information might suggest additional data capture. Some benefits include: - Tracepoints enable "always on" reporting of these errors - The error records are structured and compact - Tracepoints provide hooks for eBPF scripts Sample output: nfsd-1954 [003] 62.017901: cm_send_dreq_err: local_id=1998890974 remote_id=1129750393 state=DREQ_RCVD lap_state=LAP_UNINIT Signed-off-by: Chuck Lever <chuck.lever@oracle.com> --- drivers/infiniband/core/Makefile | 2 drivers/infiniband/core/cm.c | 80 +++------ drivers/infiniband/core/cm_trace.c | 15 ++ drivers/infiniband/core/cm_trace.h | 309 ++++++++++++++++++++++++++++++++++++ 4 files changed, 351 insertions(+), 55 deletions(-) create mode 100644 drivers/infiniband/core/cm_trace.c create mode 100644 drivers/infiniband/core/cm_trace.h