Message ID | 20180717073818.13442-2-bharat@chelsio.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
On Tue, Jul 17, 2018 at 01:08:17PM +0530, Potnuri Bharat Teja wrote: > Adds iw_cxgb4 functionality to support RDMA_WRITE_WITH_IMMEDATE opcode. > > Signed-off-by: Potnuri Bharat Teja <bharat@chelsio.com> > Signed-off-by: Steve Wise <swise@opengridcomputing.com> > drivers/infiniband/hw/cxgb4/cq.c | 23 ++++++++++++++++--- > drivers/infiniband/hw/cxgb4/qp.c | 37 ++++++++++++++++++++++++------- > drivers/infiniband/hw/cxgb4/t4.h | 16 ++++++++++++- > drivers/infiniband/hw/cxgb4/t4fw_ri_api.h | 18 ++++++++++++--- > include/uapi/rdma/cxgb4-abi.h | 3 ++- > 5 files changed, 81 insertions(+), 16 deletions(-) > > diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c > index 64f70800945b..5792675150e0 100644 > +++ b/drivers/infiniband/hw/cxgb4/cq.c > @@ -816,15 +816,32 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) > wc->byte_len = CQE_LEN(&cqe); > else > wc->byte_len = 0; > - wc->opcode = IB_WC_RECV; > - if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV || > - CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) { > + > + switch (CQE_OPCODE(&cqe)) { > + case FW_RI_SEND: > + wc->opcode = IB_WC_RECV; > + break; > + case FW_RI_SEND_WITH_INV: > + case FW_RI_SEND_WITH_SE_INV: > + wc->opcode = IB_WC_RECV; > wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe); > wc->wc_flags |= IB_WC_WITH_INVALIDATE; > c4iw_invalidate_mr(qhp->rhp, wc->ex.invalidate_rkey); > + break; > + case FW_RI_WRITE_IMMEDIATE: > + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; > + wc->ex.imm_data = CQE_IMM_DATA(&cqe); > + wc->wc_flags |= IB_WC_WITH_IMM; > + break; > + default: > + pr_err("Unexpected opcode %d in the CQE received for QPID=0x%0x\n", > + CQE_OPCODE(&cqe), CQE_QPID(&cqe)); > + ret = -EINVAL; > + goto out; > } > } else { > switch (CQE_OPCODE(&cqe)) { > + case FW_RI_WRITE_IMMEDIATE: > case FW_RI_RDMA_WRITE: > wc->opcode = IB_WC_RDMA_WRITE; > break; > diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c > index 08dc555942af..88bd67d7dc77 100644 > +++ b/drivers/infiniband/hw/cxgb4/qp.c > @@ -555,7 +555,15 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, > > if (wr->num_sge > T4_MAX_SEND_SGE) > return -EINVAL; > - wqe->write.r2 = 0; > + > + /* > + * iWARP protocol supports 64 bit immediate data but rdma api > + * limits it to 32bit. > + */ > + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) > + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = wr->ex.imm_data; > + else > + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = 0; > wqe->write.stag_sink = cpu_to_be32(rdma_wr(wr)->rkey); > wqe->write.to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr); > if (wr->num_sge) { > @@ -846,6 +854,9 @@ static int ib_to_fw_opcode(int ib_opcode) > case IB_WR_RDMA_WRITE: > opcode = FW_RI_RDMA_WRITE; > break; > + case IB_WR_RDMA_WRITE_WITH_IMM: > + opcode = FW_RI_WRITE_IMMEDIATE; > + break; > case IB_WR_RDMA_READ: > case IB_WR_RDMA_READ_WITH_INV: > opcode = FW_RI_READ_REQ; > @@ -964,6 +975,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, > enum fw_wr_opcodes fw_opcode = 0; > enum fw_ri_wr_flags fw_flags; > struct c4iw_qp *qhp; > + struct c4iw_dev *rhp; > union t4_wr *wqe = NULL; > u32 num_wrs; > struct t4_swsqe *swsqe; > @@ -971,6 +983,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, > u16 idx = 0; > > qhp = to_c4iw_qp(ibqp); > + rhp = qhp->rhp; > spin_lock_irqsave(&qhp->lock, flag); > > /* > @@ -1015,6 +1028,13 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, > swsqe->opcode = FW_RI_SEND_WITH_INV; > err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16); > break; > + case IB_WR_RDMA_WRITE_WITH_IMM: > + if (unlikely(!rhp->rdev.lldi.write_w_imm_support)) { > + err = -EINVAL; > + break; > + } > + fw_flags |= FW_RI_RDMA_WRITE_WITH_IMMEDIATE; > + /*FALLTHROUGH*/ > case IB_WR_RDMA_WRITE: > fw_opcode = FW_RI_RDMA_WRITE_WR; > swsqe->opcode = FW_RI_RDMA_WRITE; > @@ -1025,8 +1045,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, > fw_opcode = FW_RI_RDMA_READ_WR; > swsqe->opcode = FW_RI_READ_REQ; > if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) { > - c4iw_invalidate_mr(qhp->rhp, > - wr->sg_list[0].lkey); > + c4iw_invalidate_mr(rhp, wr->sg_list[0].lkey); > fw_flags = FW_RI_RDMA_READ_INVALIDATE; > } else { > fw_flags = 0; > @@ -1042,7 +1061,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, > struct c4iw_mr *mhp = to_c4iw_mr(reg_wr(wr)->mr); > > swsqe->opcode = FW_RI_FAST_REGISTER; > - if (qhp->rhp->rdev.lldi.fr_nsmr_tpte_wr_support && > + if (rhp->rdev.lldi.fr_nsmr_tpte_wr_support && > !mhp->attr.state && mhp->mpl_len <= 2) { > fw_opcode = FW_RI_FR_NSMR_TPTE_WR; > build_tpte_memreg(&wqe->fr_tpte, reg_wr(wr), > @@ -1051,7 +1070,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, > fw_opcode = FW_RI_FR_NSMR_WR; > err = build_memreg(&qhp->wq.sq, wqe, reg_wr(wr), > mhp, &len16, > - qhp->rhp->rdev.lldi.ulptx_memwrite_dsgl); > + rhp->rdev.lldi.ulptx_memwrite_dsgl); > if (err) > break; > } > @@ -1064,7 +1083,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, > fw_opcode = FW_RI_INV_LSTAG_WR; > swsqe->opcode = FW_RI_LOCAL_INV; > err = build_inv_stag(wqe, wr, &len16); > - c4iw_invalidate_mr(qhp->rhp, wr->ex.invalidate_rkey); > + c4iw_invalidate_mr(rhp, wr->ex.invalidate_rkey); > break; > default: > pr_warn("%s post of type=%d TBD!\n", __func__, > @@ -1083,7 +1102,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, > swsqe->wr_id = wr->wr_id; > if (c4iw_wr_log) { > swsqe->sge_ts = cxgb4_read_sge_timestamp( > - qhp->rhp->rdev.lldi.ports[0]); > + rhp->rdev.lldi.ports[0]); > swsqe->host_time = ktime_get(); > } > > @@ -1097,7 +1116,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, > t4_sq_produce(&qhp->wq, len16); > idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); > } > - if (!qhp->rhp->rdev.status_page->db_off) { > + if (!rhp->rdev.status_page->db_off) { > t4_ring_sq_db(&qhp->wq, idx, wqe); > spin_unlock_irqrestore(&qhp->lock, flag); > } else { > @@ -2092,6 +2111,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, > uresp.flags = C4IW_QPF_ONCHIP; > } else > uresp.flags = 0; > + if (rhp->rdev.lldi.write_w_imm_support) > + uresp.flags |= C4IW_QPF_WRITE_W_IMM; > uresp.qid_mask = rhp->rdev.qpmask; > uresp.sqid = qhp->wq.sq.qid; > uresp.sq_size = qhp->wq.sq.size; > diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h > index 11d55fc2ded7..3fdcc445ff16 100644 > +++ b/drivers/infiniband/hw/cxgb4/t4.h > @@ -190,7 +190,19 @@ struct t4_cqe { > __be32 abs_rqe_idx; > } srcqe; > struct { > - __be64 imm_data; > + __be32 mo; > + __be32 msn; > + /* > + * Use union for immediate data to be consistent with > + * stack's 32 bit data and iWARP spec's 64 bit data. > + */ > + union { > + struct { > + u32 reserved; > + __be32 imm_data32; > + } ib_imm_data; > + __be64 imm_data64; You guys should think carefully about this as this choice to put the 32 bit version at the end becomes a permanent wire protocol ABI for iWarp. The definition of 'imm data' is a memory image of what is put into the packet, so placing the 32 bit version of this at the end of the array on the wire seems like a strange choice. If we extend verbs, it will continue to be a memory image semantic, and I would expect the 32 bits version to be the upper chunk of the array, not the lower chunk. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hey Jason, > -----Original Message----- > From: linux-rdma-owner@vger.kernel.org <linux-rdma- > owner@vger.kernel.org> On Behalf Of Jason Gunthorpe > Sent: Thursday, July 26, 2018 12:21 PM > To: Potnuri Bharat Teja <bharat@chelsio.com> > Cc: dledford@redhat.com; swise@opengridcomputing.com; > rajur@chelsio.com; linux-rdma@vger.kernel.org > Subject: Re: [PATCH for-next v2 1/2] iw_cxgb4: RDMA write with immediate > support > > On Tue, Jul 17, 2018 at 01:08:17PM +0530, Potnuri Bharat Teja wrote: > > Adds iw_cxgb4 functionality to support RDMA_WRITE_WITH_IMMEDATE > opcode. > > > > Signed-off-by: Potnuri Bharat Teja <bharat@chelsio.com> > > Signed-off-by: Steve Wise <swise@opengridcomputing.com> > > drivers/infiniband/hw/cxgb4/cq.c | 23 ++++++++++++++++--- > > drivers/infiniband/hw/cxgb4/qp.c | 37 ++++++++++++++++++++++++-- > ----- > > drivers/infiniband/hw/cxgb4/t4.h | 16 ++++++++++++- > > drivers/infiniband/hw/cxgb4/t4fw_ri_api.h | 18 ++++++++++++--- > > include/uapi/rdma/cxgb4-abi.h | 3 ++- > > 5 files changed, 81 insertions(+), 16 deletions(-) > > > > diff --git a/drivers/infiniband/hw/cxgb4/cq.c > b/drivers/infiniband/hw/cxgb4/cq.c > > index 64f70800945b..5792675150e0 100644 > > +++ b/drivers/infiniband/hw/cxgb4/cq.c > > @@ -816,15 +816,32 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, > struct ib_wc *wc) > > wc->byte_len = CQE_LEN(&cqe); > > else > > wc->byte_len = 0; > > - wc->opcode = IB_WC_RECV; > > - if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV || > > - CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) { > > + > > + switch (CQE_OPCODE(&cqe)) { > > + case FW_RI_SEND: > > + wc->opcode = IB_WC_RECV; > > + break; > > + case FW_RI_SEND_WITH_INV: > > + case FW_RI_SEND_WITH_SE_INV: > > + wc->opcode = IB_WC_RECV; > > wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe); > > wc->wc_flags |= IB_WC_WITH_INVALIDATE; > > c4iw_invalidate_mr(qhp->rhp, wc- > >ex.invalidate_rkey); > > + break; > > + case FW_RI_WRITE_IMMEDIATE: > > + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; > > + wc->ex.imm_data = CQE_IMM_DATA(&cqe); > > + wc->wc_flags |= IB_WC_WITH_IMM; > > + break; > > + default: > > + pr_err("Unexpected opcode %d in the CQE received > for QPID=0x%0x\n", > > + CQE_OPCODE(&cqe), CQE_QPID(&cqe)); > > + ret = -EINVAL; > > + goto out; > > } > > } else { > > switch (CQE_OPCODE(&cqe)) { > > + case FW_RI_WRITE_IMMEDIATE: > > case FW_RI_RDMA_WRITE: > > wc->opcode = IB_WC_RDMA_WRITE; > > break; > > diff --git a/drivers/infiniband/hw/cxgb4/qp.c > b/drivers/infiniband/hw/cxgb4/qp.c > > index 08dc555942af..88bd67d7dc77 100644 > > +++ b/drivers/infiniband/hw/cxgb4/qp.c > > @@ -555,7 +555,15 @@ static int build_rdma_write(struct t4_sq *sq, > union t4_wr *wqe, > > > > if (wr->num_sge > T4_MAX_SEND_SGE) > > return -EINVAL; > > - wqe->write.r2 = 0; > > + > > + /* > > + * iWARP protocol supports 64 bit immediate data but rdma api > > + * limits it to 32bit. > > + */ > > + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) > > + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = wr- > >ex.imm_data; > > + else > > + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = 0; > > wqe->write.stag_sink = cpu_to_be32(rdma_wr(wr)->rkey); > > wqe->write.to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr); > > if (wr->num_sge) { > > @@ -846,6 +854,9 @@ static int ib_to_fw_opcode(int ib_opcode) > > case IB_WR_RDMA_WRITE: > > opcode = FW_RI_RDMA_WRITE; > > break; > > + case IB_WR_RDMA_WRITE_WITH_IMM: > > + opcode = FW_RI_WRITE_IMMEDIATE; > > + break; > > case IB_WR_RDMA_READ: > > case IB_WR_RDMA_READ_WITH_INV: > > opcode = FW_RI_READ_REQ; > > @@ -964,6 +975,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > enum fw_wr_opcodes fw_opcode = 0; > > enum fw_ri_wr_flags fw_flags; > > struct c4iw_qp *qhp; > > + struct c4iw_dev *rhp; > > union t4_wr *wqe = NULL; > > u32 num_wrs; > > struct t4_swsqe *swsqe; > > @@ -971,6 +983,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > u16 idx = 0; > > > > qhp = to_c4iw_qp(ibqp); > > + rhp = qhp->rhp; > > spin_lock_irqsave(&qhp->lock, flag); > > > > /* > > @@ -1015,6 +1028,13 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > swsqe->opcode = FW_RI_SEND_WITH_INV; > > err = build_rdma_send(&qhp->wq.sq, wqe, wr, > &len16); > > break; > > + case IB_WR_RDMA_WRITE_WITH_IMM: > > + if (unlikely(!rhp->rdev.lldi.write_w_imm_support)) { > > + err = -EINVAL; > > + break; > > + } > > + fw_flags |= > FW_RI_RDMA_WRITE_WITH_IMMEDIATE; > > + /*FALLTHROUGH*/ > > case IB_WR_RDMA_WRITE: > > fw_opcode = FW_RI_RDMA_WRITE_WR; > > swsqe->opcode = FW_RI_RDMA_WRITE; > > @@ -1025,8 +1045,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > fw_opcode = FW_RI_RDMA_READ_WR; > > swsqe->opcode = FW_RI_READ_REQ; > > if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) { > > - c4iw_invalidate_mr(qhp->rhp, > > - wr->sg_list[0].lkey); > > + c4iw_invalidate_mr(rhp, wr->sg_list[0].lkey); > > fw_flags = FW_RI_RDMA_READ_INVALIDATE; > > } else { > > fw_flags = 0; > > @@ -1042,7 +1061,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > struct c4iw_mr *mhp = to_c4iw_mr(reg_wr(wr)->mr); > > > > swsqe->opcode = FW_RI_FAST_REGISTER; > > - if (qhp->rhp->rdev.lldi.fr_nsmr_tpte_wr_support && > > + if (rhp->rdev.lldi.fr_nsmr_tpte_wr_support && > > !mhp->attr.state && mhp->mpl_len <= 2) { > > fw_opcode = FW_RI_FR_NSMR_TPTE_WR; > > build_tpte_memreg(&wqe->fr_tpte, > reg_wr(wr), > > @@ -1051,7 +1070,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > fw_opcode = FW_RI_FR_NSMR_WR; > > err = build_memreg(&qhp->wq.sq, wqe, > reg_wr(wr), > > mhp, &len16, > > - qhp->rhp- > >rdev.lldi.ulptx_memwrite_dsgl); > > + rhp->rdev.lldi.ulptx_memwrite_dsgl); > > if (err) > > break; > > } > > @@ -1064,7 +1083,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > fw_opcode = FW_RI_INV_LSTAG_WR; > > swsqe->opcode = FW_RI_LOCAL_INV; > > err = build_inv_stag(wqe, wr, &len16); > > - c4iw_invalidate_mr(qhp->rhp, wr- > >ex.invalidate_rkey); > > + c4iw_invalidate_mr(rhp, wr->ex.invalidate_rkey); > > break; > > default: > > pr_warn("%s post of type=%d TBD!\n", __func__, > > @@ -1083,7 +1102,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > swsqe->wr_id = wr->wr_id; > > if (c4iw_wr_log) { > > swsqe->sge_ts = cxgb4_read_sge_timestamp( > > - qhp->rhp->rdev.lldi.ports[0]); > > + rhp->rdev.lldi.ports[0]); > > swsqe->host_time = ktime_get(); > > } > > > > @@ -1097,7 +1116,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct > ib_send_wr *wr, > > t4_sq_produce(&qhp->wq, len16); > > idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); > > } > > - if (!qhp->rhp->rdev.status_page->db_off) { > > + if (!rhp->rdev.status_page->db_off) { > > t4_ring_sq_db(&qhp->wq, idx, wqe); > > spin_unlock_irqrestore(&qhp->lock, flag); > > } else { > > @@ -2092,6 +2111,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, > struct ib_qp_init_attr *attrs, > > uresp.flags = C4IW_QPF_ONCHIP; > > } else > > uresp.flags = 0; > > + if (rhp->rdev.lldi.write_w_imm_support) > > + uresp.flags |= C4IW_QPF_WRITE_W_IMM; > > uresp.qid_mask = rhp->rdev.qpmask; > > uresp.sqid = qhp->wq.sq.qid; > > uresp.sq_size = qhp->wq.sq.size; > > diff --git a/drivers/infiniband/hw/cxgb4/t4.h > b/drivers/infiniband/hw/cxgb4/t4.h > > index 11d55fc2ded7..3fdcc445ff16 100644 > > +++ b/drivers/infiniband/hw/cxgb4/t4.h > > @@ -190,7 +190,19 @@ struct t4_cqe { > > __be32 abs_rqe_idx; > > } srcqe; > > struct { > > - __be64 imm_data; > > + __be32 mo; > > + __be32 msn; > > + /* > > + * Use union for immediate data to be consistent with > > + * stack's 32 bit data and iWARP spec's 64 bit data. > > + */ > > + union { > > + struct { > > + u32 reserved; > > + __be32 imm_data32; > > + } ib_imm_data; > > + __be64 imm_data64; > > You guys should think carefully about this as this choice to put the > 32 bit version at the end becomes a permanent wire protocol ABI for > iWarp. > > The definition of 'imm data' is a memory image of what is put into the > packet, so placing the 32 bit version of this at the end of the array > on the wire seems like a strange choice. > > If we extend verbs, it will continue to be a memory image semantic, > and I would expect the 32 bits version to be the upper chunk of the > array, not the lower chunk. > Are you assuming an LE host? Which brings up a point that 'imm data' is not very platform independent. But I guess putting it in the upper chunk is fine. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jul 26, 2018 at 12:51:06PM -0500, Steve Wise wrote: > > > struct { > > > - __be64 imm_data; > > > + __be32 mo; > > > + __be32 msn; > > > + /* > > > + * Use union for immediate data to be consistent > with > > > + * stack's 32 bit data and iWARP spec's 64 bit data. > > > + */ > > > + union { > > > + struct { > > > + u32 reserved; > > > + __be32 imm_data32; > > > + } ib_imm_data; > > > + __be64 imm_data64; > > > > You guys should think carefully about this as this choice to put the > > 32 bit version at the end becomes a permanent wire protocol ABI for > > iWarp. > > > > The definition of 'imm data' is a memory image of what is put into the > > packet, so placing the 32 bit version of this at the end of the array > > on the wire seems like a strange choice. > > > > If we extend verbs, it will continue to be a memory image semantic, > > and I would expect the 32 bits version to be the upper chunk of the > > array, not the lower chunk. > > > > Are you assuming an LE host? endianness doesn't matter, the above always orders the imm_data_32 in the last 4 bytes of the imm_data. > Which brings up a point that 'imm data' is not very platform > independent. But I guess putting it in the upper chunk is fine. imm_data is defined as a memory image. byte 0 of imm_data in the work request goes into bytes 0 of the packet and comes out again at byte 0 in the work completion. This is fully platform independent. Most apps choose to put a BE value in here, but any other fixed-memory-order value would be fine. You should also make sure your byte ordering on the wire is right, as this becomes ABI forever. Recommend to follow IB with byte 0 as byte 0 in the packet. (I think, at least) Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> -----Original Message----- > From: Jason Gunthorpe <jgg@ziepe.ca> > Sent: Thursday, July 26, 2018 2:04 PM > To: Steve Wise <swise@opengridcomputing.com> > Cc: 'Potnuri Bharat Teja' <bharat@chelsio.com>; dledford@redhat.com; > rajur@chelsio.com; linux-rdma@vger.kernel.org > Subject: Re: [PATCH for-next v2 1/2] iw_cxgb4: RDMA write with immediate > support > > On Thu, Jul 26, 2018 at 12:51:06PM -0500, Steve Wise wrote: > > > > struct { > > > > - __be64 imm_data; > > > > + __be32 mo; > > > > + __be32 msn; > > > > + /* > > > > + * Use union for immediate data to be consistent > > with > > > > + * stack's 32 bit data and iWARP spec's 64 bit data. > > > > + */ > > > > + union { > > > > + struct { > > > > + u32 reserved; > > > > + __be32 imm_data32; > > > > + } ib_imm_data; > > > > + __be64 imm_data64; > > > > > > You guys should think carefully about this as this choice to put the > > > 32 bit version at the end becomes a permanent wire protocol ABI for > > > iWarp. > > > > > > The definition of 'imm data' is a memory image of what is put into the > > > packet, so placing the 32 bit version of this at the end of the array > > > on the wire seems like a strange choice. > > > > > > If we extend verbs, it will continue to be a memory image semantic, > > > and I would expect the 32 bits version to be the upper chunk of the > > > array, not the lower chunk. > > > > > > > Are you assuming an LE host? > > endianness doesn't matter, the above always orders the imm_data_32 in > the last 4 bytes of the imm_data. > > > Which brings up a point that 'imm data' is not very platform > > independent. But I guess putting it in the upper chunk is fine. > > imm_data is defined as a memory image. byte 0 of imm_data in the work > request goes into bytes 0 of the packet and comes out again at byte 0 > in the work completion. > > This is fully platform independent. > > Most apps choose to put a BE value in here, but any other > fixed-memory-order value would be fine. My point was, the applications need to "know" what endianness their peer is, or both agree to put it in BE or LE. IE The Verbs API doesn't do this for them and convert from host BO to a standard protocol-defined byte order. > > You should also make sure your byte ordering on the wire is right, as > this becomes ABI forever. Recommend to follow IB with byte 0 as byte 0 > in the packet. (I think, at least) > > Jason Hey Bharat, please have a look at the RFC for this and see if they specify anything. Thanks! -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Friday, July 07/27/18, 2018 at 00:46:00 +0530, Steve Wise wrote: > > > > -----Original Message----- > > From: Jason Gunthorpe <jgg@ziepe.ca> > > Sent: Thursday, July 26, 2018 2:04 PM > > To: Steve Wise <swise@opengridcomputing.com> > > Cc: 'Potnuri Bharat Teja' <bharat@chelsio.com>; dledford@redhat.com; > > rajur@chelsio.com; linux-rdma@vger.kernel.org > > Subject: Re: [PATCH for-next v2 1/2] iw_cxgb4: RDMA write with immediate > > support > > > > On Thu, Jul 26, 2018 at 12:51:06PM -0500, Steve Wise wrote: > > > > > struct { > > > > > - __be64 imm_data; > > > > > + __be32 mo; > > > > > + __be32 msn; > > > > > + /* > > > > > + * Use union for immediate data to be > consistent > > > with > > > > > + * stack's 32 bit data and iWARP spec's 64 > bit data. > > > > > + */ > > > > > + union { > > > > > + struct { > > > > > + u32 reserved; > > > > > + __be32 imm_data32; > > > > > + } ib_imm_data; > > > > > + __be64 imm_data64; > > > > > > > > You guys should think carefully about this as this choice to put the > > > > 32 bit version at the end becomes a permanent wire protocol ABI for > > > > iWarp. > > > > > > > > The definition of 'imm data' is a memory image of what is put into the > > > > packet, so placing the 32 bit version of this at the end of the array > > > > on the wire seems like a strange choice. > > > > > > > > If we extend verbs, it will continue to be a memory image semantic, > > > > and I would expect the 32 bits version to be the upper chunk of the > > > > array, not the lower chunk. > > > > > > > > > > Are you assuming an LE host? > > > > endianness doesn't matter, the above always orders the imm_data_32 in > > the last 4 bytes of the imm_data. > > > > > Which brings up a point that 'imm data' is not very platform > > > independent. But I guess putting it in the upper chunk is fine. > > > > imm_data is defined as a memory image. byte 0 of imm_data in the work > > request goes into bytes 0 of the packet and comes out again at byte 0 > > in the work completion. > > > > This is fully platform independent. > > > > Most apps choose to put a BE value in here, but any other > > fixed-memory-order value would be fine. > > My point was, the applications need to "know" what endianness their peer is, > or both agree to put it in BE or LE. IE The Verbs API doesn't do this for > them and convert from host BO to a standard protocol-defined byte order. > > > > > You should also make sure your byte ordering on the wire is right, as > > this becomes ABI forever. Recommend to follow IB with byte 0 as byte 0 > > in the packet. (I think, at least) > > > > Jason > > Hey Bharat, please have a look at the RFC for this and see if they specify > anything. > Spec has nothing much specified about the Byte Order or any details as such. https://tools.ietf.org/html/rfc7306#section-6 Thanks, Bharat. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Friday, July 07/27/18, 2018 at 00:33:51 +0530, Jason Gunthorpe wrote: > On Thu, Jul 26, 2018 at 12:51:06PM -0500, Steve Wise wrote: > > > > struct { > > > > - __be64 imm_data; > > > > + __be32 mo; > > > > + __be32 msn; > > > > + /* > > > > + * Use union for immediate data to be consistent > > with > > > > + * stack's 32 bit data and iWARP spec's 64 bit data. > > > > + */ > > > > + union { > > > > + struct { > > > > + u32 reserved; > > > > + __be32 imm_data32; > > > > + } ib_imm_data; > > > > + __be64 imm_data64; > > > > > > You guys should think carefully about this as this choice to put the > > > 32 bit version at the end becomes a permanent wire protocol ABI for > > > iWarp. > > > > > > The definition of 'imm data' is a memory image of what is put into the > > > packet, so placing the 32 bit version of this at the end of the array > > > on the wire seems like a strange choice. > > > > > > If we extend verbs, it will continue to be a memory image semantic, > > > and I would expect the 32 bits version to be the upper chunk of the > > > array, not the lower chunk. > > > > > > > Are you assuming an LE host? > > endianness doesn't matter, the above always orders the imm_data_32 in > the last 4 bytes of the imm_data. > > > Which brings up a point that 'imm data' is not very platform > > independent. But I guess putting it in the upper chunk is fine. > > imm_data is defined as a memory image. byte 0 of imm_data in the work > request goes into bytes 0 of the packet and comes out again at byte 0 > in the work completion. > > This is fully platform independent. > > Most apps choose to put a BE value in here, but any other > fixed-memory-order value would be fine. > > You should also make sure your byte ordering on the wire is right, as > this becomes ABI forever. Recommend to follow IB with byte 0 as byte 0 > in the packet. (I think, at least) > Got it. Shall interchange the imm_data32 and send out a v3. Thanks! -Bharat. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jul 26, 2018 at 02:16:00PM -0500, Steve Wise wrote: > > Most apps choose to put a BE value in here, but any other > > fixed-memory-order value would be fine. > > My point was, the applications need to "know" what endianness their peer is, > or both agree to put it in BE or LE. IE The Verbs API doesn't do this for > them and convert from host BO to a standard protocol-defined byte order. Yes, of course. What was done wrong here is defining it as a u32 at all, it really should have been uint8_t imm_data[4]; To make it clear it is just an opaque application controlled buffer. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Friday, July 07/27/18, 2018 at 02:40:31 +0530, Jason Gunthorpe wrote: > On Thu, Jul 26, 2018 at 02:16:00PM -0500, Steve Wise wrote: > > > > Most apps choose to put a BE value in here, but any other > > > fixed-memory-order value would be fine. > > > > My point was, the applications need to "know" what endianness their peer is, > > or both agree to put it in BE or LE. IE The Verbs API doesn't do this for > > them and convert from host BO to a standard protocol-defined byte order. > > Yes, of course. What was done wrong here is defining it as a u32 at > all, it really should have been > > uint8_t imm_data[4]; > > To make it clear it is just an opaque application controlled buffer. > Hi Jason, Does my v3 look good enough to get into for next branch? Thanks, Bharat. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Aug 01, 2018 at 12:48:27AM +0530, Potnuri Bharat Teja wrote: > On Friday, July 07/27/18, 2018 at 02:40:31 +0530, Jason Gunthorpe wrote: > > On Thu, Jul 26, 2018 at 02:16:00PM -0500, Steve Wise wrote: > > > > > > Most apps choose to put a BE value in here, but any other > > > > fixed-memory-order value would be fine. > > > > > > My point was, the applications need to "know" what endianness their peer is, > > > or both agree to put it in BE or LE. IE The Verbs API doesn't do this for > > > them and convert from host BO to a standard protocol-defined byte order. > > > > Yes, of course. What was done wrong here is defining it as a u32 at > > all, it really should have been > > > > uint8_t imm_data[4]; > > > > To make it clear it is just an opaque application controlled buffer. > > > Hi Jason, > Does my v3 look good enough to get into for next branch? Did you run it through sparse/etc? I see Bart has sent fixes for the last cxgb4 series.. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wednesday, August 08/01/18, 2018 at 04:25:17 +0530, Jason Gunthorpe wrote: > On Wed, Aug 01, 2018 at 12:48:27AM +0530, Potnuri Bharat Teja wrote: > > On Friday, July 07/27/18, 2018 at 02:40:31 +0530, Jason Gunthorpe wrote: > > > On Thu, Jul 26, 2018 at 02:16:00PM -0500, Steve Wise wrote: > > > > > > > > Most apps choose to put a BE value in here, but any other > > > > > fixed-memory-order value would be fine. > > > > > > > > My point was, the applications need to "know" what endianness their peer is, > > > > or both agree to put it in BE or LE. IE The Verbs API doesn't do this for > > > > them and convert from host BO to a standard protocol-defined byte order. > > > > > > Yes, of course. What was done wrong here is defining it as a u32 at > > > all, it really should have been > > > > > > uint8_t imm_data[4]; > > > > > > To make it clear it is just an opaque application controlled buffer. > > > > > Hi Jason, > > Does my v3 look good enough to get into for next branch? > > Did you run it through sparse/etc? I see Bart has sent fixes for the > last cxgb4 series.. Yes, I did check for sparse warnings. No warnings observed. -Bharat -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 64f70800945b..5792675150e0 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -816,15 +816,32 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) wc->byte_len = CQE_LEN(&cqe); else wc->byte_len = 0; - wc->opcode = IB_WC_RECV; - if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV || - CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) { + + switch (CQE_OPCODE(&cqe)) { + case FW_RI_SEND: + wc->opcode = IB_WC_RECV; + break; + case FW_RI_SEND_WITH_INV: + case FW_RI_SEND_WITH_SE_INV: + wc->opcode = IB_WC_RECV; wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe); wc->wc_flags |= IB_WC_WITH_INVALIDATE; c4iw_invalidate_mr(qhp->rhp, wc->ex.invalidate_rkey); + break; + case FW_RI_WRITE_IMMEDIATE: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->ex.imm_data = CQE_IMM_DATA(&cqe); + wc->wc_flags |= IB_WC_WITH_IMM; + break; + default: + pr_err("Unexpected opcode %d in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(&cqe), CQE_QPID(&cqe)); + ret = -EINVAL; + goto out; } } else { switch (CQE_OPCODE(&cqe)) { + case FW_RI_WRITE_IMMEDIATE: case FW_RI_RDMA_WRITE: wc->opcode = IB_WC_RDMA_WRITE; break; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 08dc555942af..88bd67d7dc77 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -555,7 +555,15 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, if (wr->num_sge > T4_MAX_SEND_SGE) return -EINVAL; - wqe->write.r2 = 0; + + /* + * iWARP protocol supports 64 bit immediate data but rdma api + * limits it to 32bit. + */ + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = wr->ex.imm_data; + else + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = 0; wqe->write.stag_sink = cpu_to_be32(rdma_wr(wr)->rkey); wqe->write.to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr); if (wr->num_sge) { @@ -846,6 +854,9 @@ static int ib_to_fw_opcode(int ib_opcode) case IB_WR_RDMA_WRITE: opcode = FW_RI_RDMA_WRITE; break; + case IB_WR_RDMA_WRITE_WITH_IMM: + opcode = FW_RI_WRITE_IMMEDIATE; + break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: opcode = FW_RI_READ_REQ; @@ -964,6 +975,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, enum fw_wr_opcodes fw_opcode = 0; enum fw_ri_wr_flags fw_flags; struct c4iw_qp *qhp; + struct c4iw_dev *rhp; union t4_wr *wqe = NULL; u32 num_wrs; struct t4_swsqe *swsqe; @@ -971,6 +983,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, u16 idx = 0; qhp = to_c4iw_qp(ibqp); + rhp = qhp->rhp; spin_lock_irqsave(&qhp->lock, flag); /* @@ -1015,6 +1028,13 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, swsqe->opcode = FW_RI_SEND_WITH_INV; err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16); break; + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!rhp->rdev.lldi.write_w_imm_support)) { + err = -EINVAL; + break; + } + fw_flags |= FW_RI_RDMA_WRITE_WITH_IMMEDIATE; + /*FALLTHROUGH*/ case IB_WR_RDMA_WRITE: fw_opcode = FW_RI_RDMA_WRITE_WR; swsqe->opcode = FW_RI_RDMA_WRITE; @@ -1025,8 +1045,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fw_opcode = FW_RI_RDMA_READ_WR; swsqe->opcode = FW_RI_READ_REQ; if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) { - c4iw_invalidate_mr(qhp->rhp, - wr->sg_list[0].lkey); + c4iw_invalidate_mr(rhp, wr->sg_list[0].lkey); fw_flags = FW_RI_RDMA_READ_INVALIDATE; } else { fw_flags = 0; @@ -1042,7 +1061,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct c4iw_mr *mhp = to_c4iw_mr(reg_wr(wr)->mr); swsqe->opcode = FW_RI_FAST_REGISTER; - if (qhp->rhp->rdev.lldi.fr_nsmr_tpte_wr_support && + if (rhp->rdev.lldi.fr_nsmr_tpte_wr_support && !mhp->attr.state && mhp->mpl_len <= 2) { fw_opcode = FW_RI_FR_NSMR_TPTE_WR; build_tpte_memreg(&wqe->fr_tpte, reg_wr(wr), @@ -1051,7 +1070,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fw_opcode = FW_RI_FR_NSMR_WR; err = build_memreg(&qhp->wq.sq, wqe, reg_wr(wr), mhp, &len16, - qhp->rhp->rdev.lldi.ulptx_memwrite_dsgl); + rhp->rdev.lldi.ulptx_memwrite_dsgl); if (err) break; } @@ -1064,7 +1083,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fw_opcode = FW_RI_INV_LSTAG_WR; swsqe->opcode = FW_RI_LOCAL_INV; err = build_inv_stag(wqe, wr, &len16); - c4iw_invalidate_mr(qhp->rhp, wr->ex.invalidate_rkey); + c4iw_invalidate_mr(rhp, wr->ex.invalidate_rkey); break; default: pr_warn("%s post of type=%d TBD!\n", __func__, @@ -1083,7 +1102,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, swsqe->wr_id = wr->wr_id; if (c4iw_wr_log) { swsqe->sge_ts = cxgb4_read_sge_timestamp( - qhp->rhp->rdev.lldi.ports[0]); + rhp->rdev.lldi.ports[0]); swsqe->host_time = ktime_get(); } @@ -1097,7 +1116,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, t4_sq_produce(&qhp->wq, len16); idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); } - if (!qhp->rhp->rdev.status_page->db_off) { + if (!rhp->rdev.status_page->db_off) { t4_ring_sq_db(&qhp->wq, idx, wqe); spin_unlock_irqrestore(&qhp->lock, flag); } else { @@ -2092,6 +2111,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, uresp.flags = C4IW_QPF_ONCHIP; } else uresp.flags = 0; + if (rhp->rdev.lldi.write_w_imm_support) + uresp.flags |= C4IW_QPF_WRITE_W_IMM; uresp.qid_mask = rhp->rdev.qpmask; uresp.sqid = qhp->wq.sq.qid; uresp.sq_size = qhp->wq.sq.size; diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index 11d55fc2ded7..3fdcc445ff16 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -190,7 +190,19 @@ struct t4_cqe { __be32 abs_rqe_idx; } srcqe; struct { - __be64 imm_data; + __be32 mo; + __be32 msn; + /* + * Use union for immediate data to be consistent with + * stack's 32 bit data and iWARP spec's 64 bit data. + */ + union { + struct { + u32 reserved; + __be32 imm_data32; + } ib_imm_data; + __be64 imm_data64; + } iw_imm_data; } imm_data_rcqe; u64 drain_cookie; @@ -253,6 +265,8 @@ struct t4_cqe { #define CQE_WRID_STAG(x) (be32_to_cpu((x)->u.rcqe.stag)) #define CQE_WRID_MSN(x) (be32_to_cpu((x)->u.rcqe.msn)) #define CQE_ABS_RQE_IDX(x) (be32_to_cpu((x)->u.srcqe.abs_rqe_idx)) +#define CQE_IMM_DATA(x)( \ + (x)->u.imm_data_rcqe.iw_imm_data.ib_imm_data.imm_data32) /* used for SQ completion processing */ #define CQE_WRID_SQ_IDX(x) ((x)->u.scqe.cidx) diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h index 0f4f86b004d6..45e7211e94ac 100644 --- a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h +++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h @@ -50,7 +50,8 @@ enum fw_ri_wr_opcode { FW_RI_BYPASS = 0xd, FW_RI_RECEIVE = 0xe, - FW_RI_SGE_EC_CR_RETURN = 0xf + FW_RI_SGE_EC_CR_RETURN = 0xf, + FW_RI_WRITE_IMMEDIATE = FW_RI_RDMA_INIT }; enum fw_ri_wr_flags { @@ -59,7 +60,8 @@ enum fw_ri_wr_flags { FW_RI_SOLICITED_EVENT_FLAG = 0x04, FW_RI_READ_FENCE_FLAG = 0x08, FW_RI_LOCAL_FENCE_FLAG = 0x10, - FW_RI_RDMA_READ_INVALIDATE = 0x20 + FW_RI_RDMA_READ_INVALIDATE = 0x20, + FW_RI_RDMA_WRITE_WITH_IMMEDIATE = 0x40 }; enum fw_ri_mpa_attrs { @@ -546,7 +548,17 @@ struct fw_ri_rdma_write_wr { __u16 wrid; __u8 r1[3]; __u8 len16; - __be64 r2; + /* + * Use union for immediate data to be consistent with stack's 32 bit + * data and iWARP spec's 64 bit data. + */ + union { + struct { + u32 reserved; + __be32 imm_data32; + } ib_imm_data; + __be64 imm_data64; + } iw_imm_data; __be32 plen; __be32 stag_sink; __be64 to_sink; diff --git a/include/uapi/rdma/cxgb4-abi.h b/include/uapi/rdma/cxgb4-abi.h index d0b2d829471a..f85ec1a3f727 100644 --- a/include/uapi/rdma/cxgb4-abi.h +++ b/include/uapi/rdma/cxgb4-abi.h @@ -65,7 +65,8 @@ struct c4iw_create_cq_resp { }; enum { - C4IW_QPF_ONCHIP = (1 << 0) + C4IW_QPF_ONCHIP = (1 << 0), + C4IW_QPF_WRITE_W_IMM = (1 << 1) }; struct c4iw_create_qp_resp {