From patchwork Fri Sep 16 14:54:52 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Steve Wise X-Patchwork-Id: 9336051 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork.web.codeaurora.org (Postfix) with ESMTP id E12396089F for ; Fri, 16 Sep 2016 14:59:47 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id D266829FFA for ; Fri, 16 Sep 2016 14:59:47 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id C6FC629FFF; Fri, 16 Sep 2016 14:59:47 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-6.9 required=2.0 tests=BAYES_00,RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id E7CEE29FFA for ; Fri, 16 Sep 2016 14:59:45 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S935310AbcIPO7V (ORCPT ); Fri, 16 Sep 2016 10:59:21 -0400 Received: from smtp.opengridcomputing.com ([72.48.136.20]:49730 "EHLO smtp.opengridcomputing.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S935281AbcIPO7M (ORCPT ); Fri, 16 Sep 2016 10:59:12 -0400 Received: from smtp.ogc.us (build2.ogc.int [10.10.0.32]) by smtp.opengridcomputing.com (Postfix) with ESMTP id 93E4629ECB; Fri, 16 Sep 2016 09:59:11 -0500 (CDT) Received: by smtp.ogc.us (Postfix, from userid 503) id 8837BE09DC; Fri, 16 Sep 2016 09:59:11 -0500 (CDT) Message-Id: <3153440ff068cc56a517af2ba6be2b0de0f4dd99.1474037695.git.swise@opengridcomputing.com> In-Reply-To: References: From: Steve Wise Date: Fri, 16 Sep 2016 07:54:52 -0700 Subject: [PATCH 2/2] iw_cxgb4: add fast-path for small REG_MR operations To: dledford@redhat.com, 
davem@davemloft.net Cc: netdev@vger.kernel.org, linux-rdma@vger.kernel.org Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP When processing a REG_MR work request, if fw supports the FW_RI_FR_NSMR_TPTE_WR work request, and if the page list for this registration is <= 2 pages, and the current state of the mr is INVALID, then use FW_RI_FR_NSMR_TPTE_WR to pass down a fully populated TPTE for FW to write. This avoids FW having to do an async read of the TPTE, which blocks the SQ until the read completes. To know if the current MR state is INVALID or not, iw_cxgb4 must track the state of each fastreg MR. The c4iw_mr struct state is updated as REG_MR and LOCAL_INV WRs are posted and completed, when a reg_mr is destroyed, and when RECV completions are processed that include a local invalidation. This optimization increases small IO IOPS for both iSER and NVMF. Signed-off-by: Steve Wise --- drivers/infiniband/hw/cxgb4/cq.c | 17 +++++++ drivers/infiniband/hw/cxgb4/mem.c | 2 +- drivers/infiniband/hw/cxgb4/qp.c | 67 +++++++++++++++++++++++---- drivers/infiniband/hw/cxgb4/t4.h | 4 +- drivers/infiniband/hw/cxgb4/t4fw_ri_api.h | 12 +++++ drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 1 + 6 files changed, 92 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index ac926c9..867b8cf 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -666,6 +666,18 @@ skip_cqe: return ret; } +static void invalidate_mr(struct c4iw_dev *rhp, u32 rkey) +{ + struct c4iw_mr *mhp; + unsigned long flags; + + spin_lock_irqsave(&rhp->lock, flags); + mhp = get_mhp(rhp, rkey >> 8); + if (mhp) + mhp->attr.state = 0; + spin_unlock_irqrestore(&rhp->lock, flags); +} + /* * Get one cq entry from c4iw and map it to openib. 
* @@ -721,6 +733,7 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) { wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe); wc->wc_flags |= IB_WC_WITH_INVALIDATE; + invalidate_mr(qhp->rhp, wc->ex.invalidate_rkey); } } else { switch (CQE_OPCODE(&cqe)) { @@ -746,6 +759,10 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) break; case FW_RI_FAST_REGISTER: wc->opcode = IB_WC_REG_MR; + + /* Invalidate the MR if the fastreg failed */ + if (CQE_STATUS(&cqe) != T4_ERR_SUCCESS) + invalidate_mr(qhp->rhp, CQE_WRID_FR_STAG(&cqe)); break; default: printk(KERN_ERR MOD "Unexpected opcode %d " diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 0b91b0f..80e2774 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -695,7 +695,7 @@ struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, mhp->attr.pdid = php->pdid; mhp->attr.type = FW_RI_STAG_NSMR; mhp->attr.stag = stag; - mhp->attr.state = 1; + mhp->attr.state = 0; mmid = (stag) >> 8; mhp->ibmr.rkey = mhp->ibmr.lkey = stag; if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index edb1172..3467b90 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -609,10 +609,42 @@ static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, return 0; } +static void build_tpte_memreg(struct fw_ri_fr_nsmr_tpte_wr *fr, + struct ib_reg_wr *wr, struct c4iw_mr *mhp, + u8 *len16) +{ + __be64 *p = (__be64 *)fr->pbl; + + fr->r2 = cpu_to_be32(0); + fr->stag = cpu_to_be32(mhp->ibmr.rkey); + + fr->tpte.valid_to_pdid = cpu_to_be32(FW_RI_TPTE_VALID_F | + FW_RI_TPTE_STAGKEY_V((mhp->ibmr.rkey & FW_RI_TPTE_STAGKEY_M)) | + FW_RI_TPTE_STAGSTATE_V(1) | + FW_RI_TPTE_STAGTYPE_V(FW_RI_STAG_NSMR) | + FW_RI_TPTE_PDID_V(mhp->attr.pdid)); + fr->tpte.locread_to_qpid = cpu_to_be32( + 
FW_RI_TPTE_PERM_V(c4iw_ib_to_tpt_access(wr->access)) | + FW_RI_TPTE_ADDRTYPE_V(FW_RI_VA_BASED_TO) | + FW_RI_TPTE_PS_V(ilog2(wr->mr->page_size) - 12)); + fr->tpte.nosnoop_pbladdr = cpu_to_be32(FW_RI_TPTE_PBLADDR_V( + PBL_OFF(&mhp->rhp->rdev, mhp->attr.pbl_addr)>>3)); + fr->tpte.dca_mwbcnt_pstag = cpu_to_be32(0); + fr->tpte.len_hi = cpu_to_be32(0); + fr->tpte.len_lo = cpu_to_be32(mhp->ibmr.length); + fr->tpte.va_hi = cpu_to_be32(mhp->ibmr.iova >> 32); + fr->tpte.va_lo_fbo = cpu_to_be32(mhp->ibmr.iova & 0xffffffff); + + p[0] = cpu_to_be64((u64)mhp->mpl[0]); + p[1] = cpu_to_be64((u64)mhp->mpl[1]); + + *len16 = DIV_ROUND_UP(sizeof(*fr), 16); +} + static int build_memreg(struct t4_sq *sq, union t4_wr *wqe, - struct ib_reg_wr *wr, u8 *len16, bool dsgl_supported) + struct ib_reg_wr *wr, struct c4iw_mr *mhp, u8 *len16, + bool dsgl_supported) { - struct c4iw_mr *mhp = to_c4iw_mr(wr->mr); struct fw_ri_immd *imdp; __be64 *p; int i; @@ -674,9 +706,12 @@ static int build_memreg(struct t4_sq *sq, union t4_wr *wqe, return 0; } -static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr, - u8 *len16) +static int build_inv_stag(struct c4iw_dev *dev, union t4_wr *wqe, + struct ib_send_wr *wr, u8 *len16) { + struct c4iw_mr *mhp = get_mhp(dev, wr->ex.invalidate_rkey >> 8); + + mhp->attr.state = 0; wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); wqe->inv.r2 = 0; *len16 = DIV_ROUND_UP(sizeof wqe->inv, 16); @@ -816,18 +851,32 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, if (!qhp->wq.sq.oldest_read) qhp->wq.sq.oldest_read = swsqe; break; - case IB_WR_REG_MR: - fw_opcode = FW_RI_FR_NSMR_WR; + case IB_WR_REG_MR: { + struct c4iw_mr *mhp = to_c4iw_mr(reg_wr(wr)->mr); + swsqe->opcode = FW_RI_FAST_REGISTER; - err = build_memreg(&qhp->wq.sq, wqe, reg_wr(wr), &len16, - qhp->rhp->rdev.lldi.ulptx_memwrite_dsgl); + if (qhp->rhp->rdev.lldi.fr_nsmr_tpte_wr_support && + !mhp->attr.state && mhp->mpl_len <= 2) { + fw_opcode = FW_RI_FR_NSMR_TPTE_WR; + 
build_tpte_memreg(&wqe->fr_tpte, reg_wr(wr), + mhp, &len16); + } else { + fw_opcode = FW_RI_FR_NSMR_WR; + err = build_memreg(&qhp->wq.sq, wqe, reg_wr(wr), + mhp, &len16, + qhp->rhp->rdev.lldi.ulptx_memwrite_dsgl); + if (err) + break; + } + mhp->attr.state = 1; break; + } case IB_WR_LOCAL_INV: if (wr->send_flags & IB_SEND_FENCE) fw_flags |= FW_RI_LOCAL_FENCE_FLAG; fw_opcode = FW_RI_INV_LSTAG_WR; swsqe->opcode = FW_RI_LOCAL_INV; - err = build_inv_stag(wqe, wr, &len16); + err = build_inv_stag(qhp->rhp, wqe, wr, &len16); break; default: PDBG("%s post of type=%d TBD!\n", __func__, diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index 02173f4..862381a 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -95,6 +95,7 @@ union t4_wr { struct fw_ri_rdma_read_wr read; struct fw_ri_bind_mw_wr bind; struct fw_ri_fr_nsmr_wr fr; + struct fw_ri_fr_nsmr_tpte_wr fr_tpte; struct fw_ri_inv_lstag_wr inv; struct t4_status_page status; __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS]; @@ -170,7 +171,7 @@ struct t4_cqe { __be32 msn; } rcqe; struct { - u32 nada1; + u32 stag; u16 nada2; u16 cidx; } scqe; @@ -232,6 +233,7 @@ struct t4_cqe { /* used for SQ completion processing */ #define CQE_WRID_SQ_IDX(x) ((x)->u.scqe.cidx) +#define CQE_WRID_FR_STAG(x) (be32_to_cpu((x)->u.scqe.stag)) /* generic accessor macros */ #define CQE_WRID_HI(x) (be32_to_cpu((x)->u.gen.wrid_hi)) diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h index 1e26669..010c709 100644 --- a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h +++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h @@ -669,6 +669,18 @@ struct fw_ri_fr_nsmr_wr { #define FW_RI_FR_NSMR_WR_DCACPU_G(x) \ (((x) >> FW_RI_FR_NSMR_WR_DCACPU_S) & FW_RI_FR_NSMR_WR_DCACPU_M) +struct fw_ri_fr_nsmr_tpte_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u32 r2; + __u32 stag; + struct fw_ri_tpte tpte; + __u64 pbl[2]; 
+}; + struct fw_ri_inv_lstag_wr { __u8 opcode; __u8 flags; diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h index 9164d20..3f46ca8 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h @@ -100,6 +100,7 @@ enum fw_wr_opcodes { FW_RI_RECV_WR = 0x17, FW_RI_BIND_MW_WR = 0x18, FW_RI_FR_NSMR_WR = 0x19, + FW_RI_FR_NSMR_TPTE_WR = 0x20, FW_RI_INV_LSTAG_WR = 0x1a, FW_ISCSI_TX_DATA_WR = 0x45, FW_LASTC2E_WR = 0x70