
[for-next,06/24] IB/hfi1: OPFN parameter negotiation

Message ID: 20180702152240.16400.63843.stgit@scvm10.sc.intel.com (mailing list archive)
State: Changes Requested

Commit Message

Dennis Dalessandro July 2, 2018, 3:22 p.m. UTC
From: Mike Marciniszyn <mike.marciniszyn@intel.com>

OPFN parameter negotiation allows a pair of connected RC QPs to exchange a
set of parameters in succession. The parameter exchange itself is done
using an IB compare-and-swap request with a special virtual address. The
request is triggered using a reserved IB work request opcode. OPFN
negotiation does not commence until the first ULP request. Because OPFN
operations are private to the driver, they do not generate user
completions or move the QP into the error state when they run out of
retries.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
---
 drivers/infiniband/hw/hfi1/Makefile   |    4 
 drivers/infiniband/hw/hfi1/chip.c     |   11 +
 drivers/infiniband/hw/hfi1/chip.h     |    3 
 drivers/infiniband/hw/hfi1/opfn.c     |  331 +++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/hfi1/opfn.h     |   26 +++
 drivers/infiniband/hw/hfi1/qp.c       |   17 ++
 drivers/infiniband/hw/hfi1/rc.c       |   71 +++++--
 drivers/infiniband/hw/hfi1/tid_rdma.c |  228 +++++++++++++++++++++++
 drivers/infiniband/hw/hfi1/tid_rdma.h |    8 +
 drivers/infiniband/hw/hfi1/verbs.c    |    2 
 drivers/infiniband/hw/hfi1/verbs.h    |    7 +
 11 files changed, 688 insertions(+), 20 deletions(-)
 create mode 100644 drivers/infiniband/hw/hfi1/opfn.c
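
Illustration only: the negotiation payload is a 64-bit value whose low 4 bits
carry the capability code, with the remaining bits owned by the capability
handler. Below is a minimal sketch, assuming the HFI1_VERBS_E_ATOMIC_VADDR and
IB_WR_OPFN definitions introduced by this patch, of how a requester could post
such an exchange; post_opfn_request() is a hypothetical helper, and the
driver's actual implementation is opfn_conn_request() in opfn.c below.

/*
 * Sketch only: post an OPFN exchange as a compare-and-swap to the
 * reserved virtual address, carrying the capability code in the low
 * 4 bits of the compare data.
 */
static int post_opfn_request(struct ib_qp *ibqp, u8 capcode, u64 payload)
{
	struct ib_atomic_wr wr;
	struct ib_send_wr *bad_wr;

	memset(&wr, 0, sizeof(wr));
	wr.wr.opcode = IB_WR_OPFN;			/* reserved, driver-private opcode */
	wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR;	/* marks the CAS as an OPFN exchange */
	wr.compare_add = (payload & ~0xfULL) | capcode;	/* capcode rides in bits 3:0 */

	/* Consumes a reserved send WQE; no user completion is generated. */
	return ib_post_send(ibqp, &wr.wr, &bad_wr);
}

Because the request travels as an ordinary RC atomic, the existing retry and
acknowledgment machinery covers packet loss; only the reserved virtual address
distinguishes it at the responder.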



Patch

diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
index 09762c5..359ede6 100644
--- a/drivers/infiniband/hw/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -10,8 +10,8 @@  obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
 
 hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
 	eprom.o exp_rcv.o file_ops.o firmware.o \
-	init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
-	qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o tid_rdma.o trace.o \
+	init.o intr.o mad.o mmu_rb.o opfn.o pcie.o pio.o pio_copy.o \
+	platform.o qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o tid_rdma.o trace.o \
 	uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
 	verbs_txreq.o vnic_main.o vnic_sdma.o
 
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 2c19bf7..4d4077d 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -5222,6 +5222,17 @@  int is_bx(struct hfi1_devdata *dd)
 	return (chip_rev_minor & 0xF0) == 0x10;
 }
 
+/* return true if the kernel urgent interrupt is disabled for rcd */
+bool is_urg_masked(struct hfi1_ctxtdata *rcd)
+{
+	u64 mask;
+	u32 is = IS_RCVURGENT_START + rcd->ctxt;
+	u8 bit = is % 64;
+
+	mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64)));
+	return !(mask & BIT_ULL(bit));
+}
+
 /*
  * Append string s to buffer buf.  Arguments curp and len are the current
  * position and remaining length, respectively.
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index 36b04d6..055cdbf 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -1,7 +1,7 @@ 
 #ifndef _CHIP_H
 #define _CHIP_H
 /*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -791,6 +791,7 @@  void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
 u32 hdrqempty(struct hfi1_ctxtdata *rcd);
 int is_ax(struct hfi1_devdata *dd);
 int is_bx(struct hfi1_devdata *dd);
+bool is_urg_masked(struct hfi1_ctxtdata *rcd);
 u32 read_physical_state(struct hfi1_devdata *dd);
 u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
 const char *opa_lstate_name(u32 lstate);
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
new file mode 100644
index 0000000..25c4226
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -0,0 +1,331 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "opfn.h"
+
+#define IB_BTHE_E                 BIT(IB_BTHE_E_SHIFT)
+
+#define OPFN_CODE(code) BIT((code) - 1)
+#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code)
+
+struct hfi1_opfn_type {
+	bool (*request)(struct rvt_qp *qp, u64 *data);
+	bool (*response)(struct rvt_qp *qp, u64 *data);
+	bool (*reply)(struct rvt_qp *qp, u64 data);
+	void (*error)(struct rvt_qp *qp);
+};
+
+static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = {
+	[STL_VERBS_EXTD_TID_RDMA] = {
+		.request = tid_rdma_conn_req,
+		.response = tid_rdma_conn_resp,
+		.reply = tid_rdma_conn_reply,
+		.error = tid_rdma_conn_error,
+	},
+};
+
+static void opfn_schedule_conn_request(struct rvt_qp *qp);
+
+static bool hfi1_opfn_extended(u32 bth1)
+{
+	return !!(bth1 & IB_BTHE_E);
+}
+
+static void opfn_conn_request(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct ib_atomic_wr wr;
+	struct ib_send_wr *bad_send_wr;
+	u16 mask, capcode;
+	struct hfi1_opfn_type *extd;
+	u64 data;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	/*
+	 * Exit if the extended bit is not set, or if nothing is requested, or
+	 * if we have completed all requests, or if a previous request is in
+	 * progress
+	 */
+	if (!priv->opfn.extended || !priv->opfn.requested ||
+	    priv->opfn.requested == priv->opfn.completed || priv->opfn.curr)
+		goto done;
+
+	mask = priv->opfn.requested & ~priv->opfn.completed;
+	capcode = ilog2(mask & ~(mask - 1)) + 1;
+	if (capcode >= STL_VERBS_EXTD_MAX) {
+		priv->opfn.completed |= OPFN_CODE(capcode);
+		goto done;
+	}
+
+	extd = &hfi1_opfn_handlers[capcode];
+	if (!extd || !extd->request || !extd->request(qp, &data)) {
+		/*
+		 * Either there is no handler for this capability or the request
+		 * packet could not be generated. Either way, mark it as done so
+		 * we don't keep attempting to complete it.
+		 */
+		priv->opfn.completed |= OPFN_CODE(capcode);
+		goto done;
+	}
+
+	data = (data & ~0xf) | capcode;
+
+	memset(&wr, 0, sizeof(wr));
+	wr.wr.opcode = IB_WR_OPFN;
+	wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR;
+	wr.compare_add = data;
+
+	priv->opfn.curr = capcode;	/* A new request is now in progress */
+	/* Drop opfn.lock before calling ib_post_send() */
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+
+	ret = ib_post_send(&qp->ibqp, &wr.wr, &bad_send_wr);
+	if (ret)
+		goto err;
+	return;
+err:
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	/*
+	 * In case of an unexpected error return from ib_post_send
+	 * clear opfn.curr and reschedule to try again
+	 */
+	priv->opfn.curr = STL_VERBS_EXTD_NONE;
+	opfn_schedule_conn_request(qp);
+done:
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_send_conn_request(struct work_struct *work)
+{
+	struct hfi1_opfn_data *od;
+	struct hfi1_qp_priv *qpriv;
+
+	od = container_of(work, struct hfi1_opfn_data, opfn_work);
+	qpriv = container_of(od, struct hfi1_qp_priv, opfn);
+
+	opfn_conn_request(qpriv->owner);
+}
+
+/*
+ * When QP s_lock is held in the caller, the OPFN request must be scheduled
+ * to a different workqueue to avoid double locking QP s_lock in call to
+ * ib_post_send in opfn_conn_request
+ */
+static void opfn_schedule_conn_request(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	/* XXX: should we be scheduling to a different workqueue? */
+	schedule_work(&priv->opfn.opfn_work);
+}
+
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+			struct ib_atomic_eth *ateth)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	u64 data = be64_to_cpu(ateth->compare_data);
+	struct hfi1_opfn_type *extd;
+	u8 capcode;
+	unsigned long flags;
+
+	capcode = data & 0xf;
+	if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+		return;
+
+	extd = &hfi1_opfn_handlers[capcode];
+
+	if (!extd || !extd->response) {
+		e->atomic_data = capcode;
+		return;
+	}
+
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	if (priv->opfn.completed & OPFN_CODE(capcode)) {
+		/*
+		 * We are receiving a request for a feature that has already
+		 * been negotiated. This may mean that the other side has reset
+		 */
+		priv->opfn.completed &= ~OPFN_CODE(capcode);
+		if (extd->error)
+			extd->error(qp);
+	}
+
+	if (extd->response(qp, &data))
+		priv->opfn.completed |= OPFN_CODE(capcode);
+	e->atomic_data = (data & ~0xf) | capcode;
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_reply(struct rvt_qp *qp, u64 data)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_opfn_type *extd;
+	u8 capcode;
+	unsigned long flags;
+
+	capcode = data & 0xf;
+	if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+		return;
+
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	/*
+	 * Either there is no previous request or the reply is not for the
+	 * current request
+	 */
+	if (!priv->opfn.curr || capcode != priv->opfn.curr)
+		goto done;
+
+	extd = &hfi1_opfn_handlers[capcode];
+
+	if (!extd || !extd->reply)
+		goto clear;
+
+	if (extd->reply(qp, data))
+		priv->opfn.completed |= OPFN_CODE(capcode);
+clear:
+	/*
+	 * Clear opfn.curr to indicate that the previous request is no longer in
+	 * progress
+	 */
+	priv->opfn.curr = STL_VERBS_EXTD_NONE;
+done:
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_error(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_opfn_type *extd = NULL;
+	unsigned long flags;
+	u16 capcode;
+
+	/*
+	 * The QP has gone into the Error state. We have to invalidate all
+	 * negotiated features, including the one in progress (if any). The RC
+	 * QP handling will clean the WQE for the connection request.
+	 */
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	while (priv->opfn.completed) {
+		capcode = priv->opfn.completed & ~(priv->opfn.completed - 1);
+		extd = &hfi1_opfn_handlers[ilog2(capcode) + 1];
+		if (extd->error)
+			extd->error(qp);
+		priv->opfn.completed &= ~OPFN_CODE(capcode);
+	}
+	priv->opfn.extended = false;
+	priv->opfn.requested = 0;
+	priv->opfn.curr = STL_VERBS_EXTD_NONE;
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask)
+{
+	struct ib_qp *ibqp = &qp->ibqp;
+	struct hfi1_qp_priv *priv = qp->priv;
+	unsigned long flags;
+
+	if (attr_mask & IB_QP_RETRY_CNT)
+		priv->s_retry = attr->retry_cnt;
+
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+		struct tid_rdma_params *local = &priv->tid_rdma.local;
+
+		if (attr_mask & IB_QP_TIMEOUT)
+			priv->tid_retry_timeout_jiffies = qp->timeout_jiffies;
+		if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) ||
+		    qp->pmtu == enum_to_mtu(OPA_MTU_8192)) {
+			tid_rdma_opfn_init(qp, local);
+			/*
+			 * We only want to set the OPFN requested bit when the
+			 * QP transitions to RTS.
+			 */
+			if (attr_mask & IB_QP_STATE &&
+			    attr->qp_state == IB_QPS_RTS) {
+				priv->opfn.requested |= OPFN_MASK(TID_RDMA);
+				/*
+				 * If the QP is transitioning to RTS and the
+				 * opfn.completed for TID RDMA has already been
+				 * set, the QP is being moved *back* into RTS.
+				 * We can now renegotiate the TID RDMA
+				 * parameters.
+				 */
+				if (priv->opfn.completed &
+				    OPFN_MASK(TID_RDMA)) {
+					priv->opfn.completed &=
+						~OPFN_MASK(TID_RDMA);
+					/*
+					 * Since the opfn.completed bit was
+					 * already set, it is safe to assume
+					 * that the opfn.extended is also set.
+					 */
+					opfn_schedule_conn_request(qp);
+				}
+			}
+		} else {
+			memset(local, 0, sizeof(*local));
+		}
+	}
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	if (!priv->opfn.extended && hfi1_opfn_extended(bth1) &&
+	    HFI1_CAP_IS_KSET(OPFN)) {
+		priv->opfn.extended = true;
+		if (qp->state == IB_QPS_RTS)
+			opfn_conn_request(qp);
+	}
+}
diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h
index 656cae0..3c5b041 100644
--- a/drivers/infiniband/hw/hfi1/opfn.h
+++ b/drivers/infiniband/hw/hfi1/opfn.h
@@ -91,9 +91,35 @@ 
 
 /* STL Verbs Extended */
 #define IB_BTHE_E_SHIFT           24
+#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX
+
+struct ib_atomic_eth;
+
+enum hfi1_opfn_codes {
+	STL_VERBS_EXTD_NONE = 0,
+	STL_VERBS_EXTD_TID_RDMA,
+	STL_VERBS_EXTD_MAX
+};
 
 struct hfi1_opfn_data {
 	bool extended;
+	u16 requested;
+	u16 completed;
+	enum hfi1_opfn_codes curr;
+	/* serialize opfn function calls */
+	spinlock_t lock;
+	struct work_struct opfn_work;
 };
 
+/* WR opcode for OPFN */
+#define IB_WR_OPFN IB_WR_RESERVED3
+
+void opfn_send_conn_request(struct work_struct *work);
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+			struct ib_atomic_eth *ateth);
+void opfn_conn_reply(struct rvt_qp *qp, u64 data);
+void opfn_conn_error(struct rvt_qp *qp);
+void opfn_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask);
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1);
+
 #endif /* _HFI1_OPFN_H */
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 9b1e84a..dbf8728 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -58,6 +58,8 @@ 
 #include "qp.h"
 #include "trace.h"
 #include "verbs_txreq.h"
+#include "user_exp_rcv.h"
+#include "tid_rdma.h"
 
 unsigned int hfi1_qp_table_size = 256;
 module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO);
@@ -132,6 +134,12 @@  static int iowait_sleep(
 	.qpt_support = BIT(IB_QPT_RC),
 },
 
+[IB_WR_OPFN] = {
+	.length = sizeof(struct ib_atomic_wr),
+	.qpt_support = BIT(IB_QPT_RC),
+	.flags = RVT_OPERATION_USE_RESERVE,
+},
+
 };
 
 static void flush_tx_list(struct rvt_qp *qp)
@@ -279,6 +287,8 @@  void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
 		priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
 		qp_set_16b(qp);
 	}
+
+	opfn_init(qp, attr, attr_mask);
 }
 
 /**
@@ -647,6 +657,7 @@  void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 
+	hfi1_qp_priv_tid_free(rdi, qp);
 	kfree(priv->s_ahg);
 	kfree(priv);
 }
@@ -687,6 +698,8 @@  void stop_send_queue(struct rvt_qp *qp)
 	struct hfi1_qp_priv *priv = qp->priv;
 
 	cancel_work_sync(&priv->s_iowait.iowork);
+	if (cancel_work_sync(&priv->tid_rdma.trigger_work))
+		rvt_put_qp(qp);
 }
 
 void quiesce_qp(struct rvt_qp *qp)
@@ -702,6 +715,10 @@  void notify_qp_reset(struct rvt_qp *qp)
 {
 	qp->r_adefered = 0;
 	clear_ahg(qp);
+
+	/* Clear any OPFN state */
+	if (qp->ibqp.qp_type == IB_QPT_RC)
+		opfn_conn_error(qp);
 }
 
 /*
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 9a45117..1c7c609 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -57,6 +57,10 @@ 
 /* cut down ridiculously long IB macro names */
 #define OP(x) RC_OP(x)
 
+static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+					 struct rvt_swqe *wqe,
+					 struct hfi1_ibport *ibp);
+
 static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
 		       u32 psn, u32 pmtu)
 {
@@ -509,17 +513,19 @@  int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 			 * Don't allow more operations to be started
 			 * than the QP limits allow.
 			 */
-			if (newreq) {
-				if (qp->s_num_rd_atomic >=
-				    qp->s_max_rd_atomic) {
-					qp->s_flags |= RVT_S_WAIT_RDMAR;
-					goto bail;
-				}
-				qp->s_num_rd_atomic++;
-				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
-					qp->s_lsn++;
+			if (qp->s_num_rd_atomic >=
+			    qp->s_max_rd_atomic) {
+				qp->s_flags |= RVT_S_WAIT_RDMAR;
+				goto bail;
 			}
-			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+			qp->s_num_rd_atomic++;
+
+			/* FALLTHROUGH */
+		case IB_WR_OPFN:
+			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+				qp->s_lsn++;
+			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+			    wqe->wr.opcode == IB_WR_OPFN) {
 				qp->s_state = OP(COMPARE_SWAP);
 				put_ib_ateth_swap(wqe->atomic_wr.swap,
 						  &ohdr->u.atomic_eth);
@@ -1039,6 +1045,7 @@  static void reset_psn(struct rvt_qp *qp, u32 psn)
  */
 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
 {
+	struct hfi1_qp_priv *priv = qp->priv;
 	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
 	struct hfi1_ibport *ibp;
 
@@ -1049,8 +1056,26 @@  void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
 			hfi1_migrate_qp(qp);
 			qp->s_retry = qp->s_retry_cnt;
 		} else if (qp->s_last == qp->s_acked) {
-			hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
-			rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			/*
+			 * We need special handling for the OPFN request WQEs as
+			 * they are not allowed to generate real user errors
+			 */
+			if (wqe->wr.opcode == IB_WR_OPFN) {
+				struct hfi1_ibport *ibp =
+					to_iport(qp->ibqp.device, qp->port_num);
+				/*
+				 * Call opfn_conn_reply() with capcode and
+				 * remaining data as 0 to close out the
+				 * current request
+				 */
+				opfn_conn_reply(qp, priv->opfn.curr);
+				wqe = do_rc_completion(qp, wqe, ibp);
+				qp->s_flags &= ~RVT_S_WAIT_ACK;
+			} else {
+				hfi1_send_complete(qp, wqe,
+						   IB_WC_RETRY_EXC_ERR);
+				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			}
 			return;
 		} else { /* need to handle delayed completion */
 			return;
@@ -1360,6 +1385,9 @@  static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 			u64 *vaddr = wqe->sg_list[0].vaddr;
 			*vaddr = val;
 		}
+		if (wqe->wr.opcode == IB_WR_OPFN)
+			opfn_conn_reply(qp, val);
+
 		if (qp->s_num_rd_atomic &&
 		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
 		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
@@ -2064,6 +2092,8 @@  void hfi1_rc_rcv(struct hfi1_packet *packet)
 		return;
 
 	is_fecn = process_ecn(qp, packet, false);
+	opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1]));
+
 	/*
 	 * Process responses (ACKs) before anything else.  Note that the
 	 * packet sequence number will be for something in the send work
@@ -2360,15 +2390,18 @@  void hfi1_rc_rcv(struct hfi1_packet *packet)
 
 	case OP(COMPARE_SWAP):
 	case OP(FETCH_ADD): {
-		struct ib_atomic_eth *ateth;
+		struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
+		u64 vaddr = get_ib_ateth_vaddr(ateth);
+		bool opfn = opcode == OP(COMPARE_SWAP) &&
+			vaddr == HFI1_VERBS_E_ATOMIC_VADDR;
 		struct rvt_ack_entry *e;
-		u64 vaddr;
 		atomic64_t *maddr;
 		u64 sdata;
 		u32 rkey;
 		u8 next;
 
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+			     !opfn))
 			goto nack_inv;
 		next = qp->r_head_ack_queue + 1;
 		if (next > HFI1_MAX_RDMA_ATOMIC)
@@ -2384,8 +2417,11 @@  void hfi1_rc_rcv(struct hfi1_packet *packet)
 			rvt_put_mr(e->rdma_sge.mr);
 			e->rdma_sge.mr = NULL;
 		}
-		ateth = &ohdr->u.atomic_eth;
-		vaddr = get_ib_ateth_vaddr(ateth);
+		/* Process OPFN special virtual address */
+		if (opfn) {
+			opfn_conn_response(qp, e, ateth);
+			goto ack;
+		}
 		if (unlikely(vaddr & (sizeof(u64) - 1)))
 			goto nack_inv_unlck;
 		rkey = be32_to_cpu(ateth->rkey);
@@ -2404,6 +2440,7 @@  void hfi1_rc_rcv(struct hfi1_packet *packet)
 				      sdata);
 		rvt_put_mr(qp->r_sge.sge.mr);
 		qp->r_sge.num_sge = 0;
+ack:
 		e->opcode = opcode;
 		e->sent = 0;
 		e->psn = psn;
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 54b7492..0905c83 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -80,6 +80,205 @@ 
 /* Maximum number of packets within a flow generation. */
 #define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
 
+#define GENERATION_MASK 0xFFFFF
+
+static inline u32 mask_generation(u32 a)
+{
+	return a & GENERATION_MASK;
+}
+
+/* Reserved generation value to set to unused flows for kernel contexts */
+#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
+
+#define TID_OPFN_QP_CTXT_MASK 0xff
+#define TID_OPFN_QP_CTXT_SHIFT 56
+#define TID_OPFN_QP_KDETH_MASK 0xff
+#define TID_OPFN_QP_KDETH_SHIFT 48
+#define TID_OPFN_MAX_LEN_MASK 0x7ff
+#define TID_OPFN_MAX_LEN_SHIFT 37
+#define TID_OPFN_TIMEOUT_MASK 0x1f
+#define TID_OPFN_TIMEOUT_SHIFT 32
+#define TID_OPFN_RESERVED_MASK 0x3f
+#define TID_OPFN_RESERVED_SHIFT 26
+#define TID_OPFN_URG_MASK 0x1
+#define TID_OPFN_URG_SHIFT 25
+#define TID_OPFN_VER_MASK 0x7
+#define TID_OPFN_VER_SHIFT 22
+#define TID_OPFN_JKEY_MASK 0x3f
+#define TID_OPFN_JKEY_SHIFT 16
+#define TID_OPFN_MAX_READ_MASK 0x3f
+#define TID_OPFN_MAX_READ_SHIFT 10
+#define TID_OPFN_MAX_WRITE_MASK 0x3f
+#define TID_OPFN_MAX_WRITE_SHIFT 4
+
+/*
+ * OPFN TID layout
+ *
+ * 63               47               31               15
+ * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
+ * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
+ * N - the context Number
+ * K - the Kdeth_qp
+ * M - Max_len
+ * T - Timeout
+ * D - reserveD
+ * V - version
+ * U - Urg capable
+ * J - Jkey
+ * R - max_Read
+ * W - max_Write
+ * C - Capcode
+ */
+
+static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
+{
+	return
+		(((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
+			TID_OPFN_QP_CTXT_SHIFT) |
+		((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
+			TID_OPFN_QP_KDETH_SHIFT) |
+		(((u64)((p->max_len >> PAGE_SHIFT) - 1) &
+			TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
+		(((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
+			TID_OPFN_TIMEOUT_SHIFT) |
+		(((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
+		(((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
+		(((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
+			TID_OPFN_MAX_READ_SHIFT) |
+		(((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
+			TID_OPFN_MAX_WRITE_SHIFT);
+}
+
+static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
+{
+	p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
+		TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
+	p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
+	p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
+		TID_OPFN_MAX_WRITE_MASK;
+	p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
+		TID_OPFN_MAX_READ_MASK;
+	p->qp =
+		((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
+			<< 16) |
+		((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
+	p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
+	p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
+}
+
+void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
+	p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
+	p->jkey = priv->rcd->jkey;
+	p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
+	p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
+	p->timeout = qp->timeout;
+	p->urg = is_urg_masked(priv->rcd);
+}
+
+bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	*data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
+	return true;
+}
+
+bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct tid_rdma_params *remote, *old;
+	bool ret = true;
+
+	old = rcu_dereference_protected(priv->tid_rdma.remote,
+					lockdep_is_held(&priv->opfn.lock));
+	data &= ~0xf;
+	/*
+	 * If data passed in is zero, return true so as not to continue the
+	 * negotiation process
+	 */
+	if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
+		goto null;
+	/*
+	 * If kzalloc fails, return false. This will result in:
+	 * * at the requester a new OPFN request being generated to retry
+	 *   the negotiation
+	 * * at the responder, 0 being returned to the requester so as to
+	 *   disable TID RDMA at both the requester and the responder
+	 */
+	remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
+	if (!remote) {
+		ret = false;
+		goto null;
+	}
+
+	tid_rdma_opfn_decode(remote, data);
+	priv->tid_timer_timeout_jiffies =
+		usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
+				   1000UL) << 3) * 7);
+	rcu_assign_pointer(priv->tid_rdma.remote, remote);
+	/*
+	 * A TID RDMA READ request's segment size is not equal to
+	 * remote->max_len only when the request's data length is smaller
+	 * than remote->max_len. In that case, there will be only one segment.
+	 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
+	 * during retry, it will lead to req->cur_seg = 0, which is exactly
+	 * what is expected.
+	 */
+	priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
+	priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
+	goto free;
+null:
+	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
+	priv->timeout_shift = 0;
+free:
+	if (old)
+		kfree_rcu(old, rcu_head);
+	return ret;
+}
+
+bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
+{
+	bool ret;
+
+	ret = tid_rdma_conn_reply(qp, *data);
+	*data = 0;
+	/*
+	 * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
+	 * TID RDMA could not be enabled. This will result in TID RDMA being
+	 * disabled at the requester too.
+	 */
+	if (ret)
+		(void)tid_rdma_conn_req(qp, data);
+	return ret;
+}
+
+void tid_rdma_conn_error(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct tid_rdma_params *old;
+
+	old = rcu_dereference_protected(priv->tid_rdma.remote,
+					lockdep_is_held(&priv->opfn.lock));
+	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
+	if (old)
+		kfree_rcu(old, rcu_head);
+}
+
+/**
+ * tid_rdma_trigger_resume - field a trigger work request
+ * @work: the work item
+ *
+ * Complete the off qp trigger processing by directly
+ * calling the progress routine.
+ */
+static void tid_rdma_trigger_resume(struct work_struct *work)
+{
+}
+
 void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
 {
 }
@@ -136,5 +335,34 @@  int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 
 	qpriv->rcd = qp_to_rcd(rdi, qp);
 
+	spin_lock_init(&qpriv->opfn.lock);
+	INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
+	INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
+	qpriv->flow_state.psn = 0;
+	qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
+	qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
+	qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
+
+	if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+		struct hfi1_devdata *dd = qpriv->rcd->dd;
+
+		qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
+						sizeof(*qpriv->pages),
+					    GFP_KERNEL, dd->node);
+		if (!qpriv->pages)
+			return -ENOMEM;
+	}
+
 	return 0;
 }
+
+void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+		cancel_work_sync(&priv->opfn.opfn_work);
+		kfree(priv->pages);
+		priv->pages = NULL;
+	}
+}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index 554d228..89acc4f 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -248,6 +248,11 @@  struct trdma_flow_state {
 	u32 tid_entry[TID_RDMA_MAX_PAGES];
 };
 
+bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data);
+bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data);
+bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data);
+void tid_rdma_conn_error(struct rvt_qp *qp);
+
 void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd);
 void tid_rdma_flush_wait(struct rvt_qp *qp);
 
@@ -268,5 +273,8 @@  struct trdma_flow_state {
 
 int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 		      struct ib_qp_init_attr *init_attr);
+void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
+
+void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p);
 
 #endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index bdf5336..4e6832d 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -65,6 +65,7 @@ 
 #include "vnic.h"
 #include "fault.h"
 #include "affinity.h"
+#include "tid_rdma.h"
 
 static unsigned int hfi1_lkey_table_size = 16;
 module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
@@ -1960,6 +1961,7 @@  int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
 	dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
 	dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
+	dd->verbs_dev.rdi.dparms.reserved_operations = 1;
 
 	/* post send table */
 	dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index ef6f336..7f95e43 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -72,6 +72,8 @@ 
 
 #include "iowait.h"
 #include "tid_rdma.h"
+#include "opfn.h"
+#include "common.h"
 
 #define HFI1_MAX_RDMA_ATOMIC     16
 
@@ -168,6 +170,11 @@  struct hfi1_qp_priv {
 	struct tid_rdma_qp_params tid_rdma;
 	struct rvt_qp *owner;
 	u8 hdr_type; /* 9B or 16B */
+	unsigned long tid_timer_timeout_jiffies;
+	unsigned long tid_retry_timeout_jiffies;
+	u8 s_retry;
+	u16 pkts_ps;            /* packets per segment */
+	u8 timeout_shift;       /* account for number of packets per segment */
 };
 
 /*