diff mbox series

[v2,for-next,2/3] IB/{hfi1, rdmavt, qib}: Fit completions into single aligned cache-line

Message ID 20180910164915.15907.82884.stgit@scvm10.sc.intel.com (mailing list archive)
State Rejected
Delegated to: Jason Gunthorpe
Headers show
Series IB/hfi1: Perf updates and clean up | expand

Commit Message

Dennis Dalessandro Sept. 10, 2018, 4:49 p.m. UTC
From: Sebastian Sanchez <sebastian.sanchez@intel.com>

The struct ib_wc uses two cache-lines per completion, and it is
unaligned. This structure used to fit within one cacheline, but it was
expanded by fields added in the following patches:

dd5f03beb4f7 ("IB/core: Ethernet L2 attributes in verbs/cm structures")
c865f24628b9 ("IB/core: Add rdma_network_type to wc")

These new fields are only needed for Ethernet, and for HCAs that don't
provide the network type, which is used to search for the proper GID in
the GID table.
Since there are two cache-lines, more cache-lines are dirtied per work
completion entry.

Create a kernel only rvt_wc structure that is a single aligned
cache-line. This reduces the cache lines used per completion and
eliminates any cache line push-pull by aligning the size to a
cache-line.

Cache-aligning the new kernel completion queue expands struct rvt_cq_wc
breaking the ABI for the user completion queue. Therefore, decouple the
kernel completion queue from struct rvt_cq_wc to prevent this.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com>
Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com>
Signed-off-by: Kamenee Arumugam <kamenee.arumugam@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
---
 drivers/infiniband/hw/hfi1/rc.c         |    2 +
 drivers/infiniband/hw/hfi1/ruc.c        |    2 +
 drivers/infiniband/hw/hfi1/uc.c         |    2 +
 drivers/infiniband/hw/hfi1/ud.c         |    4 +-
 drivers/infiniband/hw/qib/qib_rc.c      |    2 +
 drivers/infiniband/hw/qib/qib_ruc.c     |    2 +
 drivers/infiniband/hw/qib/qib_uc.c      |    2 +
 drivers/infiniband/hw/qib/qib_ud.c      |    4 +-
 drivers/infiniband/sw/rdmavt/cq.c       |   57 ++++++++++++++++++++-----------
 drivers/infiniband/sw/rdmavt/qp.c       |    6 ++-
 drivers/infiniband/sw/rdmavt/trace_cq.h |    6 ++-
 include/rdma/rdmavt_cq.h                |   29 +++++++++++++++-
 include/rdma/rdmavt_qp.h                |    2 +
 13 files changed, 81 insertions(+), 39 deletions(-)

Comments

Jason Gunthorpe Sept. 11, 2018, 3:45 p.m. UTC | #1
On Mon, Sep 10, 2018 at 09:49:19AM -0700, Dennis Dalessandro wrote:
> From: Sebastian Sanchez <sebastian.sanchez@intel.com>
> 
> The struct ib_wc uses two cache-lines per completion, and it is
> unaligned. This structure used to fit within one cacheline, but it was
> expanded by fields added in the following patches:

Like Parav says, that statement seems to be nonsense:

struct ib_wc {
        union {
                u64                wr_id;                /*           8 */
                struct ib_cqe *    wr_cqe;               /*           8 */
        };                                               /*     0     8 */
        enum ib_wc_status          status;               /*     8     4 */
        enum ib_wc_opcode          opcode;               /*    12     4 */
        u32                        vendor_err;           /*    16     4 */
        u32                        byte_len;             /*    20     4 */
        struct ib_qp *             qp;                   /*    24     8 */
        union {
                __be32             imm_data;             /*           4 */
                u32                invalidate_rkey;      /*           4 */
        } ex;                                            /*    32     4 */
        u32                        src_qp;               /*    36     4 */
        u32                        slid;                 /*    40     4 */
        int                        wc_flags;             /*    44     4 */
        u16                        pkey_index;           /*    48     2 */
        u8                         sl;                   /*    50     1 */
        u8                         dlid_path_bits;       /*    51     1 */
        u8                         port_num;             /*    52     1 */
        u8                         smac[6];              /*    53     6 */

        /* XXX 1 byte hole, try to pack */

        u16                        vlan_id;              /*    60     2 */
        u8                         network_hdr_type;     /*    62     1 */

        /* size: 64, cachelines: 1, members: 17 */
        /* sum members: 62, holes: 1, sum holes: 1 */
        /* padding: 1 */
};

> Create a kernel only rvt_wc structure that is a single aligned
> cache-line. This reduces the cache lines used per completion and
> eliminates any cache line push-pull by aligning the size to a
> cache-line.

Not at all sure this is even a good idea to cache align. Most of the
usages here are singletons on-stack and we can resonably expect the
stack to be hot in the cache. Wasting stack space sounds like a
performance negative..

So not taking this, resend with an accurate commit message and some
performance numbers to try again..

Jason
Dennis Dalessandro Sept. 18, 2018, 11:19 a.m. UTC | #2
On 9/11/2018 11:45 AM, Jason Gunthorpe wrote:
> So not taking this, resend with an accurate commit message and some
> performance numbers to try again..

We don't plan on pursuing this patch further. Please just drop, which I 
assume you already have.

-Denny
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 9bd63ab..975d91a 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -2041,7 +2041,7 @@  void hfi1_rc_rcv(struct hfi1_packet *packet)
 	u32 hdrsize = packet->hlen;
 	u32 psn = ib_bth_get_psn(packet->ohdr);
 	u32 pad = packet->pad;
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	u32 pmtu = qp->pmtu;
 	int diff;
 	struct ib_reth *reth;
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index 5f56f3c..019713a 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -173,7 +173,7 @@  static void ruc_loopback(struct rvt_qp *sqp)
 	struct rvt_swqe *wqe;
 	struct rvt_sge *sge;
 	unsigned long flags;
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	u64 sdata;
 	atomic64_t *maddr;
 	enum ib_wc_status send_status;
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
index e254dce..5b5e057 100644
--- a/drivers/infiniband/hw/hfi1/uc.c
+++ b/drivers/infiniband/hw/hfi1/uc.c
@@ -312,7 +312,7 @@  void hfi1_uc_rcv(struct hfi1_packet *packet)
 	u32 hdrsize = packet->hlen;
 	u32 psn;
 	u32 pad = packet->pad;
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	u32 pmtu = qp->pmtu;
 	struct ib_reth *reth;
 	int ret;
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
index 70d39fc..8a4785e 100644
--- a/drivers/infiniband/hw/hfi1/ud.c
+++ b/drivers/infiniband/hw/hfi1/ud.c
@@ -79,7 +79,7 @@  static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	unsigned long flags;
 	struct rvt_sge_state ssge;
 	struct rvt_sge *sge;
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	u32 length;
 	enum ib_qp_type sqptype, dqptype;
 
@@ -867,7 +867,7 @@  static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5,
 void hfi1_ud_rcv(struct hfi1_packet *packet)
 {
 	u32 hdrsize = packet->hlen;
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	u32 src_qp;
 	u16 pkey;
 	int mgmt_pkey_idx = -1;
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
index f35fdeb..a121d2c 100644
--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -1744,7 +1744,7 @@  void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr,
 	u32 hdrsize;
 	u32 psn;
 	u32 pad;
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	u32 pmtu = qp->pmtu;
 	int diff;
 	struct ib_reth *reth;
diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c
index f8a7de7..1230c37 100644
--- a/drivers/infiniband/hw/qib/qib_ruc.c
+++ b/drivers/infiniband/hw/qib/qib_ruc.c
@@ -191,7 +191,7 @@  static void qib_ruc_loopback(struct rvt_qp *sqp)
 	struct rvt_swqe *wqe;
 	struct rvt_sge *sge;
 	unsigned long flags;
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	u64 sdata;
 	atomic64_t *maddr;
 	enum ib_wc_status send_status;
diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c
index 3e54bc1..fa7dd3a 100644
--- a/drivers/infiniband/hw/qib/qib_uc.c
+++ b/drivers/infiniband/hw/qib/qib_uc.c
@@ -242,7 +242,7 @@  void qib_uc_rcv(struct qib_ibport *ibp, struct ib_header *hdr,
 	u32 hdrsize;
 	u32 psn;
 	u32 pad;
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	u32 pmtu = qp->pmtu;
 	struct ib_reth *reth;
 	int ret;
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c
index f8d029a..580ebc8 100644
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -58,7 +58,7 @@  static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 	unsigned long flags;
 	struct rvt_sge_state ssge;
 	struct rvt_sge *sge;
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	u32 length;
 	enum ib_qp_type sqptype, dqptype;
 
@@ -434,7 +434,7 @@  void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr,
 	int opcode;
 	u32 hdrsize;
 	u32 pad;
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	u32 qkey;
 	u32 src_qp;
 	u16 dlid;
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index 6406a08..e729335 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -62,10 +62,10 @@ 
  *
  * This may be called with qp->s_lock held.
  */
-void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
+void rvt_cq_enter(struct rvt_cq *cq, struct rvt_wc *entry, bool solicited)
 {
 	struct ib_uverbs_wc *uqueue = NULL;
-	struct ib_wc *kqueue = NULL;
+	struct rvt_wc *kqueue = NULL;
 	struct rvt_cq_wc *u_wc = NULL;
 	struct rvt_k_cq_wc *k_wc = NULL;
 	unsigned long flags;
@@ -230,6 +230,8 @@  struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
 	 * numbers of entries.
 	 */
 	if (udata && udata->outlen >= sizeof(__u64)) {
+		int err;
+
 		sz = sizeof(struct ib_uverbs_wc) * (entries + 1);
 		sz += sizeof(*u_wc);
 		u_wc = vmalloc_user(sz);
@@ -237,23 +239,11 @@  struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
 			ret = ERR_PTR(-ENOMEM);
 			goto bail_cq;
 		}
-	} else {
-		sz = sizeof(struct ib_wc) * (entries + 1);
-		sz += sizeof(*k_wc);
-		k_wc = vzalloc_node(sz, rdi->dparms.node);
-		if (!k_wc) {
-			ret = ERR_PTR(-ENOMEM);
-			goto bail_cq;
-		}
-	}
-
-	/*
-	 * Return the address of the WC as the offset to mmap.
-	 * See rvt_mmap() for details.
-	 */
-	if (udata && udata->outlen >= sizeof(__u64)) {
-		int err;
 
+		/*
+		 * Return the address of the WC as the offset to mmap.
+		 * See rvt_mmap() for details.
+		 */
 		cq->ip = rvt_create_mmap_info(rdi, sz, context, u_wc);
 		if (!cq->ip) {
 			ret = ERR_PTR(-ENOMEM);
@@ -266,6 +256,14 @@  struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
 			ret = ERR_PTR(err);
 			goto bail_ip;
 		}
+	} else {
+		sz = sizeof(struct rvt_wc) * (entries + 1);
+		sz += sizeof(*k_wc);
+		k_wc = vzalloc_node(sz, rdi->dparms.node);
+		if (!k_wc) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_cq;
+		}
 	}
 
 	spin_lock_irq(&rdi->n_cqs_lock);
@@ -343,6 +341,7 @@  int rvt_destroy_cq(struct ib_cq *ibcq)
 		kref_put(&cq->ip->ref, rvt_release_mmap_info);
 	else
 		vfree(cq->queue);
+	vfree(cq->kqueue);
 	kfree(cq);
 
 	return 0;
@@ -418,7 +417,7 @@  int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 		if (!u_wc)
 			return -ENOMEM;
 	} else {
-		sz = sizeof(struct ib_wc) * (cqe + 1);
+		sz = sizeof(struct rvt_wc) * (cqe + 1);
 		sz += sizeof(*k_wc);
 		k_wc = vzalloc_node(sz, rdi->dparms.node);
 		if (!k_wc)
@@ -520,6 +519,24 @@  int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 	return ret;
 }
 
+static void copy_rvt_wc_to_ib_wc(struct ib_wc *ibwc, struct rvt_wc *rvtwc)
+{
+	ibwc->wr_id = rvtwc->wr_id;
+	ibwc->status = rvtwc->status;
+	ibwc->opcode = rvtwc->opcode;
+	ibwc->vendor_err = rvtwc->vendor_err;
+	ibwc->byte_len = rvtwc->byte_len;
+	ibwc->qp = rvtwc->qp;
+	ibwc->ex.invalidate_rkey = rvtwc->ex.invalidate_rkey;
+	ibwc->src_qp = rvtwc->src_qp;
+	ibwc->wc_flags = rvtwc->wc_flags;
+	ibwc->slid = rvtwc->slid;
+	ibwc->pkey_index = rvtwc->pkey_index;
+	ibwc->sl = rvtwc->sl;
+	ibwc->dlid_path_bits = rvtwc->dlid_path_bits;
+	ibwc->port_num = rvtwc->port_num;
+}
+
 /**
  * rvt_poll_cq - poll for work completion entries
  * @ibcq: the completion queue to poll
@@ -554,7 +571,7 @@  int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
 			break;
 		/* The kernel doesn't need a RMB since it has the lock. */
 		trace_rvt_cq_poll(cq, &wc->kqueue[tail], npolled);
-		*entry = wc->kqueue[tail];
+		copy_rvt_wc_to_ib_wc(entry, &wc->kqueue[tail]);
 		if (tail >= cq->ibcq.cqe)
 			tail = 0;
 		else
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 5ce403c..e2cc827 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1049,7 +1049,7 @@  struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
  */
 int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
 {
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	int ret = 0;
 	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
 
@@ -1573,7 +1573,7 @@  int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 			return -ENOMEM;
 		}
 		if (unlikely(qp_err_flush)) {
-			struct ib_wc wc;
+			struct rvt_wc wc;
 
 			memset(&wc, 0, sizeof(wc));
 			wc.qp = &qp->ibqp;
@@ -1996,7 +1996,7 @@  int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
 {
 	int i, j, ret;
-	struct ib_wc wc;
+	struct rvt_wc wc;
 	struct rvt_lkey_table *rkt;
 	struct rvt_pd *pd;
 	struct rvt_sge_state *ss;
diff --git a/drivers/infiniband/sw/rdmavt/trace_cq.h b/drivers/infiniband/sw/rdmavt/trace_cq.h
index df8e1ad..832308d 100644
--- a/drivers/infiniband/sw/rdmavt/trace_cq.h
+++ b/drivers/infiniband/sw/rdmavt/trace_cq.h
@@ -109,7 +109,7 @@ 
 
 DECLARE_EVENT_CLASS(
 	rvt_cq_entry_template,
-	TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx),
+	TP_PROTO(struct rvt_cq *cq, struct rvt_wc *wc, u32 idx),
 	TP_ARGS(cq, wc, idx),
 	TP_STRUCT__entry(
 		RDI_DEV_ENTRY(cq->rdi)
@@ -143,12 +143,12 @@ 
 
 DEFINE_EVENT(
 	rvt_cq_entry_template, rvt_cq_enter,
-	TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx),
+	TP_PROTO(struct rvt_cq *cq, struct rvt_wc *wc, u32 idx),
 	TP_ARGS(cq, wc, idx));
 
 DEFINE_EVENT(
 	rvt_cq_entry_template, rvt_cq_poll,
-	TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx),
+	TP_PROTO(struct rvt_cq *cq, struct rvt_wc *wc, u32 idx),
 	TP_ARGS(cq, wc, idx));
 
 #endif /* __RVT_TRACE_CQ_H */
diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h
index 9f25be0..a5d3e3c 100644
--- a/include/rdma/rdmavt_cq.h
+++ b/include/rdma/rdmavt_cq.h
@@ -62,6 +62,30 @@ 
 #include <rdma/rvt-abi.h>
 
 /*
+ * If any fields within struct rvt_wc change, the function
+ * copy_rvt_wc_to_ib_wc() should be updated.
+ */
+struct rvt_wc {
+	u64			wr_id;
+	enum ib_wc_status	status;
+	enum ib_wc_opcode	opcode;
+	u32			vendor_err;
+	u32			byte_len;
+	struct ib_qp	       *qp;
+	union {
+		__be32		imm_data;
+		u32		invalidate_rkey;
+	} ex;
+	u32			src_qp;
+	int			wc_flags;
+	u32			slid;
+	u16			pkey_index;
+	u8			sl;
+	u8			dlid_path_bits;
+	u8			port_num; /* valid only for DR SMPs on switches */
+} ____cacheline_aligned_in_smp;
+
+/*
  * This structure is used to contain the head pointer, tail pointer,
  * and completion queue entries as a single memory allocation so
  * it can be mmap'ed into user space.
@@ -69,7 +93,8 @@ 
 struct rvt_k_cq_wc {
 	u32 head;               /* index of next entry to fill */
 	u32 tail;               /* index of next ib_poll_cq() entry */
-	struct ib_wc kqueue[0];
+	/* this is actually size ibcq.cqe + 1 */
+	struct rvt_wc kqueue[0];
 };
 
 /*
@@ -93,6 +118,6 @@  struct rvt_cq {
 	return container_of(ibcq, struct rvt_cq, ibcq);
 }
 
-void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited);
+void rvt_cq_enter(struct rvt_cq *cq, struct rvt_wc *entry, bool solicited);
 
 #endif          /* DEF_RDMAVT_INCCQH */
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 927f6d5..125db26 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -589,7 +589,7 @@  static inline void rvt_qp_swqe_complete(
 	if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
 	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
 	     status != IB_WC_SUCCESS) {
-		struct ib_wc wc;
+		struct rvt_wc wc;
 
 		memset(&wc, 0, sizeof(wc));
 		wc.wr_id = wqe->wr.wr_id;