
IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing

Message ID 1452594732-9573-1-git-send-email-sagig@mellanox.com (mailing list archive)
State Changes Requested

Commit Message

Sagi Grimberg Jan. 12, 2016, 10:32 a.m. UTC
mlx5 keeps a lot of internal accounting for wr processing.
mlx5_ib_wq consists of multiple arrays:
struct mlx5_ib_wq {
	u64		       *wrid;
	u32		       *wr_data;
	struct wr_list	       *w_list;
	unsigned	       *wqe_head;
	...
}

Each time we access one of these arrays, even for a single index,
we fetch a cacheline. Reduce cacheline bounces by fitting these members
into a cacheline-aligned struct (swr_ctx) and allocating an array of it.
Accessing this array fetches all of these members in a single shot.

Since the receive queue needs only the wrid, we use an anonymous union
where rwr_ctx has only the wrid member.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c      | 18 +++++++--------
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 21 +++++++++++++----
 drivers/infiniband/hw/mlx5/qp.c      | 45 +++++++++++++++---------------------
 3 files changed, 44 insertions(+), 40 deletions(-)

Comments

Yann Droneaud Jan. 12, 2016, 2:37 p.m. UTC | #1
Hi,

Le mardi 12 janvier 2016 à 12:32 +0200, Sagi Grimberg a écrit :
> mlx5 keeps a lot of internal accounting for wr processing.
> mlx5_ib_wq consists of multiple arrays:
> struct mlx5_ib_wq {
> 	u64		       *wrid;
> 	u32		       *wr_data;
> 	struct wr_list	       *w_list;
> 	unsigned	       *wqe_head;
> 	...
> }
> 
> Each time we access one of these arrays, even for a single index,
> we fetch a cacheline. Reduce cacheline bounces by fitting these members
> into a cacheline-aligned struct (swr_ctx) and allocating an array of it.
> Accessing this array fetches all of these members in a single shot.
> 
> Since the receive queue needs only the wrid, we use an anonymous union
> where rwr_ctx has only the wrid member.
> 
> Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
> ---
>  drivers/infiniband/hw/mlx5/cq.c      | 18 +++++++--------
>  drivers/infiniband/hw/mlx5/mlx5_ib.h | 21 +++++++++++++----
>  drivers/infiniband/hw/mlx5/qp.c      | 45 +++++++++++++++---------------------
>  3 files changed, 44 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> index d4b227126265..84cb8fc072a1 100644
> --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
> +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> @@ -129,11 +129,24 @@ struct wr_list {
>  	u16	next;
>  };
>  
> +/* Please don't let this exceed a single cacheline */

Don't add a comment; add a compile-time assert:

BUILD_BUG_ON(sizeof(struct swr_ctx) > L1_CACHE_BYTES);

> +struct swr_ctx {
> +	u64		wrid;
> +	u32		wr_data;
> +	struct wr_list	w_list;
> +	u32		wqe_head;
> +	u8		rsvd[12];
> +}__packed;
> +

Packing the structure might make some fields unaligned and, on some
architectures, unaligned accesses are likely unwelcome.

What about

struct swr_ctx {
	u64		wrid;
	u32		wr_data;
	struct wr_list	w_list;
	u32		wqe_head;
} ____cacheline_aligned;


> +struct rwr_ctx {
> +	u64		       wrid;
> +}__packed;
> +
>  struct mlx5_ib_wq {
> -	u64		       *wrid;
> -	u32		       *wr_data;
> -	struct wr_list	       *w_list;
> -	unsigned	       *wqe_head;
> +	union {
> +		struct swr_ctx *swr_ctx;
> +		struct rwr_ctx *rwr_ctx;
> +	};
>  	u16		        unsig_count;

Check the structure layout is the one you expect with pahole.


> diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
> index 1ea049ed87da..a6b88902d7af 100644
> --- a/drivers/infiniband/hw/mlx5/qp.c
> +++ b/drivers/infiniband/hw/mlx5/qp.c
> @@ -794,14 +794,11 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
>  		goto err_free;
>  	}
>  
> -	qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL);
> -	qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL);
> -	qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL);
> -	qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL);
> -	qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL);
> -
> -	if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid ||
> -	    !qp->sq.w_list || !qp->sq.wqe_head) {
> +	qp->sq.swr_ctx = kcalloc(qp->sq.wqe_cnt, sizeof(*qp->sq.swr_ctx),
> +				 GFP_KERNEL);
> +	qp->rq.rwr_ctx = kcalloc(qp->rq.wqe_cnt, sizeof(*qp->sq.rwr_ctx),
> +				 GFP_KERNEL);


Anyway, I'm not sure about the alignment of the memory returned by
kcalloc(); I should know this by now, but I haven't found time to figure
out how it's handled by the various SL*B allocator implementations.

Regards.

-- 
Yann Droneaud
OPTEYA


Sagi Grimberg Jan. 13, 2016, 8:44 a.m. UTC | #2
> Hi,

Hi Yann,

>>
>> +/* Please don't let this exceed a single cacheline */
>
> Don't add a comment; add a compile-time assert:
>
> BUILD_BUG_ON(sizeof(struct swr_ctx) > L1_CACHE_BYTES);

Sure, I'll do that.

>> +struct swr_ctx {
>> +	u64		wrid;
>> +	u32		wr_data;
>> +	struct wr_list	w_list;
>> +	u32		wqe_head;
>> +	u8		rsvd[12];
>> +}__packed;
>> +
>
> Packing the structure might make some fields unaligned and, on some
> architectures, unaligned accesses are likely unwelcome.

I manually align, so I can remove the packing...

> What about
>
> struct swr_ctx {
> 	u64		wrid;
> 	u32		wr_data;
> 	struct wr_list	w_list;
> 	u32		wqe_head;
> } ____cacheline_aligned;

The reason I didn't use ____cacheline_aligned is that on 64B-cacheline
architectures I'd get padding to 64B, while I only need to pad to 32B. With
32B entries I get the benefit of fetching 2 entries per cacheline, which is
sort of an implicit prefetch (useful for the next post_send).

Do you know any annotation that can do this for me?
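
For illustration only (this is not something proposed in the thread), the
generic __aligned() attribute takes an explicit byte count, so 32B padding
could in principle be expressed without __packed or a manual rsvd[] pad:

/* Sketch, assuming __aligned(32) is acceptable here: the struct size is
 * rounded up to a multiple of 32B and each array element is 32B aligned,
 * so two entries share one 64B cacheline. The layout would still need to
 * be confirmed with pahole. */
struct swr_ctx {
	u64		wrid;
	u32		wr_data;
	struct wr_list	w_list;
	u32		wqe_head;
} __aligned(32);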

>> +struct rwr_ctx {
>> +	u64		       wrid;
>> +}__packed;
>> +
>>   struct mlx5_ib_wq {
>> -	u64		       *wrid;
>> -	u32		       *wr_data;
>> -	struct wr_list	       *w_list;
>> -	unsigned	       *wqe_head;
>> +	union {
>> +		struct swr_ctx *swr_ctx;
>> +		struct rwr_ctx *rwr_ctx;
>> +	};
>>   	u16		        unsig_count;
>
> Check the structure layout is the one you expect with pahole.

I did; given that I simply removed 3 pointers, I get the same layout
(2B hole after unsig_count). A nice side effect is that mlx5_ib_wq is now
smaller than a 64B cacheline (reduced from 80B to 56B).

>> +	qp->sq.swr_ctx = kcalloc(qp->sq.wqe_cnt, sizeof(*qp->sq.swr_ctx),
>> +				 GFP_KERNEL);
>> +	qp->rq.rwr_ctx = kcalloc(qp->rq.wqe_cnt, sizeof(*qp->sq.rwr_ctx),
>> +				 GFP_KERNEL);
>>
>
> Anyway, I'm not sure about the alignment of the memory returned by
> kcalloc(); I should know this by now, but I haven't found time to figure
> out how it's handled by the various SL*B allocator implementations.

Can you explain why I should worry about the alignment of kcalloc()
here?
Yann Droneaud Jan. 13, 2016, 9:26 a.m. UTC | #3
Le mercredi 13 janvier 2016 à 10:44 +0200, Sagi Grimberg a écrit :
> > Anyway, I'm not sure about the alignment of the memory returned by
> > kcalloc(); I should know this by now, but I haven't found time to figure
> > out how it's handled by the various SL*B allocator implementations.
> 
> Can you explain why I should worry about the alignment of kcalloc()
> here?

Say it returns memory aligned on 16 bytes, then accessing some items in
the array is going to touch 2 cachelines, defeating the purpose of the
padding or the alignment attribute added to the item structure.
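
To make that concrete (a standalone illustration; the 32B entry size and
64B cacheline are assumptions for the example, not numbers taken from the
patch), a base that is 16B- but not 32B-aligned makes every other entry
straddle a cacheline boundary:

#include <stdio.h>

int main(void)
{
	unsigned long base = 0x10;	/* 16B-aligned, but not 32B-aligned */
	unsigned int entry = 32, line = 64, i;

	for (i = 0; i < 4; i++) {
		unsigned long start = base + i * entry;
		unsigned long end = start + entry - 1;

		/* an entry whose first and last bytes sit in different
		 * 64B lines costs two cacheline fetches */
		printf("entry %u: %#lx-%#lx%s\n", i, start, end,
		       start / line != end / line ? " (two cachelines)" : "");
	}
	return 0;
}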

Regards.

-- 
Yann Droneaud
OPTEYA

Sagi Grimberg Jan. 13, 2016, 9:37 a.m. UTC | #4
>>> Anyway, I'm not sure about the alignment of the memory returned by
>>> kcalloc(); I should know this by now, but I haven't found time to figure
>>> out how it's handled by the various SL*B allocator implementations.
>>
>> Can you explain why I should worry about the alignment of kcalloc()
>> here?
>
> Say it returns memory aligned on 16 bytes, then accessing some items in
> the array is going to touch 2 cachelines, defeating the purpose of the
> padding or the alignment attribute added to the item structure.

I see; I think the alignment is ARCH_KMALLOC_MINALIGN. I'm not aware of
an aligned array allocation API. Do you have a suggestion for how I can
align the allocation? I could manually pad and align, but that would mean
saving two extra pointers (the aligned pointer and the original pointer
for kfree()).
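
A hedged sketch of what that manual approach might look like (the
swr_ctx_buf name and the error handling are illustrative only, not part of
the patch): over-allocate by one entry, keep the raw pointer for kfree(),
and derive the aligned pointer with PTR_ALIGN():

	/* illustrative only: keep both the raw and the aligned pointer;
	 * one extra entry covers the worst-case alignment slack */
	qp->sq.swr_ctx_buf = kcalloc(qp->sq.wqe_cnt + 1,
				     sizeof(struct swr_ctx), GFP_KERNEL);
	if (!qp->sq.swr_ctx_buf)
		return -ENOMEM;
	qp->sq.swr_ctx = PTR_ALIGN(qp->sq.swr_ctx_buf,
				   sizeof(struct swr_ctx));

	/* ... */

	kfree(qp->sq.swr_ctx_buf);	/* free the raw pointer, not the aligned one */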

Patch

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index b14316603e44..5eb0fcac72b1 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -102,7 +102,7 @@  static void *next_cqe_sw(struct mlx5_ib_cq *cq)
 
 static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx)
 {
-	switch (wq->wr_data[idx]) {
+	switch (wq->swr_ctx[idx].wr_data) {
 	case MLX5_IB_WR_UMR:
 		return 0;
 
@@ -194,7 +194,7 @@  static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
 		}
 	} else {
 		wq	  = &qp->rq;
-		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid;
 		++wq->tail;
 	}
 	wc->byte_len = be32_to_cpu(cqe->byte_cnt);
@@ -378,9 +378,9 @@  static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
 		if (idx == head)
 			break;
 
-		tail = qp->sq.w_list[idx].next;
+		tail = qp->sq.swr_ctx[idx].w_list.next;
 	} while (1);
-	tail = qp->sq.w_list[idx].next;
+	tail = qp->sq.swr_ctx[idx].w_list.next;
 	qp->sq.last_poll = tail;
 }
 
@@ -490,8 +490,8 @@  repoll:
 		idx = wqe_ctr & (wq->wqe_cnt - 1);
 		handle_good_req(wc, cqe64, wq, idx);
 		handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
-		wc->wr_id = wq->wrid[idx];
-		wq->tail = wq->wqe_head[idx] + 1;
+		wc->wr_id = wq->swr_ctx[idx].wrid;
+		wq->tail = wq->swr_ctx[idx].wqe_head + 1;
 		wc->status = IB_WC_SUCCESS;
 		break;
 	case MLX5_CQE_RESP_WR_IMM:
@@ -516,8 +516,8 @@  repoll:
 			wq = &(*cur_qp)->sq;
 			wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
 			idx = wqe_ctr & (wq->wqe_cnt - 1);
-			wc->wr_id = wq->wrid[idx];
-			wq->tail = wq->wqe_head[idx] + 1;
+			wc->wr_id = wq->swr_ctx[idx].wrid;
+			wq->tail = wq->swr_ctx[idx].wqe_head + 1;
 		} else {
 			struct mlx5_ib_srq *srq;
 
@@ -528,7 +528,7 @@  repoll:
 				mlx5_ib_free_srq_wqe(srq, wqe_ctr);
 			} else {
 				wq = &(*cur_qp)->rq;
-				wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+				wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid;
 				++wq->tail;
 			}
 		}
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d4b227126265..84cb8fc072a1 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -129,11 +129,24 @@  struct wr_list {
 	u16	next;
 };
 
+/* Please don't let this exceed a single cacheline */
+struct swr_ctx {
+	u64		wrid;
+	u32		wr_data;
+	struct wr_list	w_list;
+	u32		wqe_head;
+	u8		rsvd[12];
+}__packed;
+
+struct rwr_ctx {
+	u64		       wrid;
+}__packed;
+
 struct mlx5_ib_wq {
-	u64		       *wrid;
-	u32		       *wr_data;
-	struct wr_list	       *w_list;
-	unsigned	       *wqe_head;
+	union {
+		struct swr_ctx *swr_ctx;
+		struct rwr_ctx *rwr_ctx;
+	};
 	u16		        unsig_count;
 
 	/* serialize post to the work queue
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 1ea049ed87da..a6b88902d7af 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -794,14 +794,11 @@  static int create_kernel_qp(struct mlx5_ib_dev *dev,
 		goto err_free;
 	}
 
-	qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL);
-	qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL);
-	qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL);
-	qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL);
-	qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL);
-
-	if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid ||
-	    !qp->sq.w_list || !qp->sq.wqe_head) {
+	qp->sq.swr_ctx = kcalloc(qp->sq.wqe_cnt, sizeof(*qp->sq.swr_ctx),
+				 GFP_KERNEL);
+	qp->rq.rwr_ctx = kcalloc(qp->rq.wqe_cnt, sizeof(*qp->sq.rwr_ctx),
+				 GFP_KERNEL);
+	if (!qp->sq.swr_ctx || !qp->rq.rwr_ctx) {
 		err = -ENOMEM;
 		goto err_wrid;
 	}
@@ -811,11 +808,8 @@  static int create_kernel_qp(struct mlx5_ib_dev *dev,
 
 err_wrid:
 	mlx5_db_free(dev->mdev, &qp->db);
-	kfree(qp->sq.wqe_head);
-	kfree(qp->sq.w_list);
-	kfree(qp->sq.wrid);
-	kfree(qp->sq.wr_data);
-	kfree(qp->rq.wrid);
+	kfree(qp->sq.swr_ctx);
+	kfree(qp->rq.rwr_ctx);
 
 err_free:
 	kvfree(*in);
@@ -831,11 +825,8 @@  err_uuar:
 static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 {
 	mlx5_db_free(dev->mdev, &qp->db);
-	kfree(qp->sq.wqe_head);
-	kfree(qp->sq.w_list);
-	kfree(qp->sq.wrid);
-	kfree(qp->sq.wr_data);
-	kfree(qp->rq.wrid);
+	kfree(qp->sq.swr_ctx);
+	kfree(qp->rq.rwr_ctx);
 	mlx5_buf_free(dev->mdev, &qp->buf);
 	free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn);
 }
@@ -2623,11 +2614,11 @@  static void finish_wqe(struct mlx5_ib_qp *qp,
 	if (unlikely(qp->wq_sig))
 		ctrl->signature = wq_sig(ctrl);
 
-	qp->sq.wrid[idx] = wr_id;
-	qp->sq.w_list[idx].opcode = mlx5_opcode;
-	qp->sq.wqe_head[idx] = qp->sq.head + nreq;
+	qp->sq.swr_ctx[idx].wrid = wr_id;
+	qp->sq.swr_ctx[idx].w_list.opcode = mlx5_opcode;
+	qp->sq.swr_ctx[idx].wqe_head = qp->sq.head + nreq;
 	qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
-	qp->sq.w_list[idx].next = qp->sq.cur_post;
+	qp->sq.swr_ctx[idx].w_list.next = qp->sq.cur_post;
 }
 
 
@@ -2708,7 +2699,7 @@  int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 
 			case IB_WR_LOCAL_INV:
 				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
-				qp->sq.wr_data[idx] = IB_WR_LOCAL_INV;
+				qp->sq.swr_ctx[idx].wr_data = IB_WR_LOCAL_INV;
 				ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey);
 				set_linv_wr(qp, &seg, &size);
 				num_sge = 0;
@@ -2716,7 +2707,7 @@  int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 
 			case IB_WR_REG_MR:
 				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
-				qp->sq.wr_data[idx] = IB_WR_REG_MR;
+				qp->sq.swr_ctx[idx].wr_data = IB_WR_REG_MR;
 				ctrl->imm = cpu_to_be32(reg_wr(wr)->key);
 				err = set_reg_wr(qp, reg_wr(wr), &seg, &size);
 				if (err) {
@@ -2727,7 +2718,7 @@  int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 				break;
 
 			case IB_WR_REG_SIG_MR:
-				qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR;
+				qp->sq.swr_ctx[idx].wr_data = IB_WR_REG_SIG_MR;
 				mr = to_mmr(sig_handover_wr(wr)->sig_mr);
 
 				ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
@@ -2829,7 +2820,7 @@  int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 				mlx5_ib_warn(dev, "bad opcode\n");
 				goto out;
 			}
-			qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
+			qp->sq.swr_ctx[idx].wr_data = MLX5_IB_WR_UMR;
 			ctrl->imm = cpu_to_be32(umr_wr(wr)->mkey);
 			set_reg_umr_segment(seg, wr);
 			seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
@@ -2977,7 +2968,7 @@  int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 			set_sig_seg(sig, (qp->rq.max_gs + 1) << 2);
 		}
 
-		qp->rq.wrid[ind] = wr->wr_id;
+		qp->rq.rwr_ctx[ind].wrid = wr->wr_id;
 
 		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
 	}