
[v5,08/13] SIW queue pair methods

Message ID 20190219100903.15408-9-bmt@zurich.ibm.com (mailing list archive)
State Superseded
Delegated to: Jason Gunthorpe
Series SIW: Request for Comments

Commit Message

Bernard Metzler Feb. 19, 2019, 10:08 a.m. UTC
Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
---
 drivers/infiniband/sw/siw/siw_qp.c | 1478 ++++++++++++++++++++++++++++
 1 file changed, 1478 insertions(+)
 create mode 100644 drivers/infiniband/sw/siw/siw_qp.c

Comments

Leon Romanovsky Feb. 24, 2019, 1:19 p.m. UTC | #1
On Tue, Feb 19, 2019 at 11:08:58AM +0100, Bernard Metzler wrote:
> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
> ---
>  drivers/infiniband/sw/siw/siw_qp.c | 1478 ++++++++++++++++++++++++++++
>  1 file changed, 1478 insertions(+)
>  create mode 100644 drivers/infiniband/sw/siw/siw_qp.c
>
> diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c
> new file mode 100644
> index 000000000000..75fd151dae39
> --- /dev/null
> +++ b/drivers/infiniband/sw/siw/siw_qp.c
> @@ -0,0 +1,1478 @@
> +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
> +/*
> + * Software iWARP device driver
> + *
> + * Authors: Bernard Metzler <bmt@zurich.ibm.com>
> + *
> + * Copyright (c) 2008-2018, IBM Corporation
> + *
> + * This software is available to you under a choice of one of two
> + * licenses. You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * BSD license below:
> + *
> + *   Redistribution and use in source and binary forms, with or
> + *   without modification, are permitted provided that the following
> + *   conditions are met:
> + *
> + *   - Redistributions of source code must retain the above copyright notice,
> + *     this list of conditions and the following disclaimer.
> + *
> + *   - Redistributions in binary form must reproduce the above copyright
> + *     notice, this list of conditions and the following disclaimer in the
> + *     documentation and/or other materials provided with the distribution.
> + *
> + *   - Neither the name of IBM nor the names of its contributors may be
> + *     used to endorse or promote products derived from this software without
> + *     specific prior written permission.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#include <linux/errno.h>
> +#include <linux/types.h>
> +#include <linux/net.h>
> +#include <linux/file.h>
> +#include <linux/scatterlist.h>
> +#include <linux/highmem.h>
> +#include <linux/vmalloc.h>
> +#include <asm/barrier.h>
> +#include <net/sock.h>
> +#include <net/tcp_states.h>
> +#include <net/tcp.h>
> +
> +#include <rdma/iw_cm.h>
> +#include <rdma/ib_verbs.h>
> +#include <rdma/ib_smi.h>
> +#include <rdma/ib_user_verbs.h>
> +
> +#include "siw.h"
> +#include "siw_obj.h"
> +#include "siw_cm.h"
> +
> +static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
> +	[SIW_QP_STATE_IDLE]		= "IDLE",
> +	[SIW_QP_STATE_RTR]		= "RTR",
> +	[SIW_QP_STATE_RTS]		= "RTS",
> +	[SIW_QP_STATE_CLOSING]		= "CLOSING",
> +	[SIW_QP_STATE_TERMINATE]	= "TERMINATE",
> +	[SIW_QP_STATE_ERROR]		= "ERROR"
> +};
> +
> +/*
> + * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
> + * per-RDMAP message basis. Please keep the order of initializers. All MPA
> + * lengths are initialized to the minimum packet size.
> + */
> +struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = { {
> +	/* RDMAP_RDMA_WRITE */
> +	.hdr_len = sizeof(struct iwarp_rdma_write),
> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST
> +		| cpu_to_be16(DDP_VERSION << 8)
> +		| cpu_to_be16(RDMAP_VERSION << 6)
> +		| cpu_to_be16(RDMAP_RDMA_WRITE),
> +	.proc_data = siw_proc_write
> +},
> +{	/* RDMAP_RDMA_READ_REQ */
> +	.hdr_len = sizeof(struct iwarp_rdma_rreq),
> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> +		| cpu_to_be16(DDP_VERSION << 8)
> +		| cpu_to_be16(RDMAP_VERSION << 6)
> +		| cpu_to_be16(RDMAP_RDMA_READ_REQ),
> +	.proc_data = siw_proc_rreq
> +},
> +{	/* RDMAP_RDMA_READ_RESP */
> +	.hdr_len = sizeof(struct iwarp_rdma_rresp),
> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST
> +		| cpu_to_be16(DDP_VERSION << 8)
> +		| cpu_to_be16(RDMAP_VERSION << 6)
> +		| cpu_to_be16(RDMAP_RDMA_READ_RESP),
> +	.proc_data = siw_proc_rresp
> +},
> +{	/* RDMAP_SEND */
> +	.hdr_len = sizeof(struct iwarp_send),
> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> +		| cpu_to_be16(DDP_VERSION << 8)
> +		| cpu_to_be16(RDMAP_VERSION << 6)
> +		| cpu_to_be16(RDMAP_SEND),
> +	.proc_data = siw_proc_send
> +},
> +{	/* RDMAP_SEND_INVAL */
> +	.hdr_len = sizeof(struct iwarp_send_inv),
> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> +		| cpu_to_be16(DDP_VERSION << 8)
> +		| cpu_to_be16(RDMAP_VERSION << 6)
> +		| cpu_to_be16(RDMAP_SEND_INVAL),
> +	.proc_data = siw_proc_send
> +},
> +{	/* RDMAP_SEND_SE */
> +	.hdr_len = sizeof(struct iwarp_send),
> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> +		| cpu_to_be16(DDP_VERSION << 8)
> +		| cpu_to_be16(RDMAP_VERSION << 6)
> +		| cpu_to_be16(RDMAP_SEND_SE),
> +	.proc_data = siw_proc_send
> +},
> +{	/* RDMAP_SEND_SE_INVAL */
> +	.hdr_len = sizeof(struct iwarp_send_inv),
> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> +		| cpu_to_be16(DDP_VERSION << 8)
> +		| cpu_to_be16(RDMAP_VERSION << 6)
> +		| cpu_to_be16(RDMAP_SEND_SE_INVAL),
> +	.proc_data = siw_proc_send
> +},
> +{	/* RDMAP_TERMINATE */
> +	.hdr_len = sizeof(struct iwarp_terminate),
> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> +		| cpu_to_be16(DDP_VERSION << 8)
> +		| cpu_to_be16(RDMAP_VERSION << 6)
> +		| cpu_to_be16(RDMAP_TERMINATE),
> +	.proc_data = siw_proc_terminate
> +} };
> +
> +void siw_qp_llp_data_ready(struct sock *sk)
> +{
> +	struct siw_qp		*qp;
> +
> +	read_lock(&sk->sk_callback_lock);
> +
> +	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
> +		goto done;
> +
> +	qp = sk_to_qp(sk);
> +
> +	if (likely(!qp->rx_ctx.rx_suspend &&
> +		   down_read_trylock(&qp->state_lock))) {
> +		read_descriptor_t rd_desc = {.arg.data = qp, .count = 1};
> +
> +		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
> +			/*
> +			 * Implements data receive operation during
> +			 * socket callback. TCP gracefully catches
> +			 * the case where there is nothing to receive
> +			 * (not calling siw_tcp_rx_data() then).
> +			 */
> +			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
> +
> +		up_read(&qp->state_lock);
> +	} else {
> +		siw_dbg_qp(qp, "unable to rx, suspend: %d\n",
> +			   qp->rx_ctx.rx_suspend);
> +	}
> +done:
> +	read_unlock(&sk->sk_callback_lock);
> +}
> +
> +void siw_qp_llp_close(struct siw_qp *qp)
> +{
> +	siw_dbg_qp(qp, "enter llp close, state = %s\n",
> +		   siw_qp_state_to_string[qp->attrs.state]);
> +
> +	down_write(&qp->state_lock);
> +
> +	qp->rx_ctx.rx_suspend = 1;
> +	qp->tx_ctx.tx_suspend = 1;
> +	qp->attrs.sk = NULL;
> +
> +	switch (qp->attrs.state) {
> +
> +	case SIW_QP_STATE_RTS:
> +	case SIW_QP_STATE_RTR:
> +	case SIW_QP_STATE_IDLE:
> +	case SIW_QP_STATE_TERMINATE:
> +
> +		qp->attrs.state = SIW_QP_STATE_ERROR;
> +
> +		break;
> +	/*
> +	 * SIW_QP_STATE_CLOSING:
> +	 *
> +	 * This is a forced close. Shall the QP be moved to
> +	 * ERROR or IDLE?
> +	 */
> +	case SIW_QP_STATE_CLOSING:
> +		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> +		else
> +			qp->attrs.state = SIW_QP_STATE_IDLE;
> +
> +		break;
> +
> +	default:
> +		siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
> +			   siw_qp_state_to_string[qp->attrs.state]);
> +		break;
> +	}
> +	siw_sq_flush(qp);
> +	siw_rq_flush(qp);
> +
> +	/*
> +	 * Dereference closing CEP
> +	 */
> +	if (qp->cep) {
> +		siw_cep_put(qp->cep);
> +		qp->cep = NULL;
> +	}
> +
> +	up_write(&qp->state_lock);
> +
> +	siw_dbg_qp(qp, "llp close exit: state %s\n",
> +		   siw_qp_state_to_string[qp->attrs.state]);
> +}
> +
> +/*
> + * socket callback routine informing about newly available send space.
> + * Function schedules SQ work for processing SQ items.
> + */
> +void siw_qp_llp_write_space(struct sock *sk)
> +{
> +	struct siw_cep	*cep = sk_to_cep(sk);
> +
> +	cep->sk_write_space(sk);
> +
> +	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
> +		(void) siw_sq_start(cep->qp);
> +}
> +
> +static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
> +{
> +	if (!irq_size)
> +		irq_size = 1;
> +	if (!orq_size)
> +		orq_size = 1;
> +
> +	qp->attrs.irq_size = irq_size;
> +	qp->attrs.orq_size = orq_size;
> +
> +	qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
> +	if (!qp->irq) {
> +		siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
> +		qp->attrs.irq_size = 0;
> +		return -ENOMEM;
> +	}
> +	qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
> +	if (!qp->orq) {
> +		siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
> +		qp->attrs.orq_size = 0;
> +		qp->attrs.irq_size = 0;
> +		vfree(qp->irq);
> +		return -ENOMEM;
> +	}
> +	return 0;
> +}
> +
> +static int siw_qp_enable_crc(struct siw_qp *qp)
> +{
> +	struct siw_iwarp_rx *c_rx = &qp->rx_ctx;
> +	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
> +	int rv = 0;
> +
> +	if (siw_crypto_shash == NULL) {
> +		rv = -ENOENT;
> +		goto error;
> +	}
> +	c_tx->mpa_crc_hd = kzalloc(sizeof(struct shash_desc) +
> +				   crypto_shash_descsize(siw_crypto_shash),
> +				   GFP_KERNEL);
> +	c_rx->mpa_crc_hd = kzalloc(sizeof(struct shash_desc) +
> +				   crypto_shash_descsize(siw_crypto_shash),
> +				   GFP_KERNEL);
> +	if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
> +		rv = -ENOMEM;
> +		goto error;
> +	}
> +	c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
> +	c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
> +
> +	return 0;
> +error:
> +	siw_dbg_qp(qp, "failed loading crc32c. error %d\n", rv);
> +
> +	kfree(c_tx->mpa_crc_hd);
> +	kfree(c_rx->mpa_crc_hd);
> +
> +	c_tx->mpa_crc_hd = c_rx->mpa_crc_hd = NULL;
> +
> +	return rv;
> +}
> +
> +/*
> + * Send a non-signalled READ or WRITE to the peer side as negotiated
> + * with the MPAv2 P2P setup protocol. The work request is only created
> + * as a current active WR and does not consume Send Queue space.
> + *
> + * Caller must hold QP state lock.
> + */
> +int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
> +{
> +	struct siw_wqe	*wqe = tx_wqe(qp);
> +	unsigned long flags;
> +	int rv = 0;
> +
> +	spin_lock_irqsave(&qp->sq_lock, flags);
> +
> +	if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
> +		spin_unlock_irqrestore(&qp->sq_lock, flags);
> +		return -EIO;
> +	}
> +	memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
> +
> +	wqe->wr_status = SIW_WR_QUEUED;
> +	wqe->sqe.flags = 0;
> +	wqe->sqe.num_sge = 1;
> +	wqe->sqe.sge[0].length = 0;
> +	wqe->sqe.sge[0].laddr = 0;
> +	wqe->sqe.sge[0].lkey = 0;
> +	/*
> +	 * While the STag must not be checked for inbound zero length
> +	 * READ/WRITE, some HW may treat STag 0 specially.
> +	 */
> +	wqe->sqe.rkey = 1;
> +	wqe->sqe.raddr = 0;
> +	wqe->processed = 0;
> +
> +	if (ctrl & MPA_V2_RDMA_WRITE_RTR)
> +		wqe->sqe.opcode = SIW_OP_WRITE;
> +	else if (ctrl & MPA_V2_RDMA_READ_RTR) {
> +		struct siw_sqe	*rreq;
> +
> +		wqe->sqe.opcode = SIW_OP_READ;
> +
> +		spin_lock(&qp->orq_lock);
> +
> +		rreq = orq_get_free(qp);
> +		if (rreq) {
> +			siw_read_to_orq(rreq, &wqe->sqe);
> +			qp->orq_put++;
> +		} else
> +			rv = -EIO;
> +
> +		spin_unlock(&qp->orq_lock);
> +	} else
> +		rv = -EINVAL;
> +
> +	if (rv)
> +		wqe->wr_status = SIW_WR_IDLE;
> +
> +	spin_unlock_irqrestore(&qp->sq_lock, flags);
> +
> +	if (!rv)
> +		rv = siw_sq_start(qp);
> +
> +	return rv;
> +}
> +
> +/*
> + * Map memory access error to DDP tagged error
> + */
> +enum ddp_ecode siw_tagged_error(enum siw_access_state state)
> +{
> +	if (state == E_STAG_INVALID)
> +		return DDP_ECODE_T_INVALID_STAG;
> +	if (state == E_BASE_BOUNDS)
> +		return DDP_ECODE_T_BASE_BOUNDS;
> +	if (state == E_PD_MISMATCH)
> +		return DDP_ECODE_T_STAG_NOT_ASSOC;
> +	if (state == E_ACCESS_PERM)
> +		/*
> +		 * RFC 5041 (DDP) lacks an ecode for insufficient access
> +		 * permissions. 'Invalid STag' seems to be the closest
> +		 * match, though.
> +		 */
> +		return DDP_ECODE_T_INVALID_STAG;
> +
> +	WARN_ON(1);
> +
> +	return DDP_ECODE_T_INVALID_STAG;
> +}
> +
> +/*
> + * Map memory access error to RDMAP protection error
> + */
> +enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
> +{
> +	if (state == E_STAG_INVALID)
> +		return RDMAP_ECODE_INVALID_STAG;
> +	if (state == E_BASE_BOUNDS)
> +		return RDMAP_ECODE_BASE_BOUNDS;
> +	if (state == E_PD_MISMATCH)
> +		return RDMAP_ECODE_STAG_NOT_ASSOC;
> +	if (state == E_ACCESS_PERM)
> +		return RDMAP_ECODE_ACCESS_RIGHTS;
> +
> +	return RDMAP_ECODE_UNSPECIFIED;
> +}
> +
> +void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer,
> +			u8 etype, u8 ecode, int in_tx)
> +{
> +	if (!qp->term_info.valid) {
> +		memset(&qp->term_info, 0, sizeof(qp->term_info));
> +		qp->term_info.layer = layer;
> +		qp->term_info.etype = etype;
> +		qp->term_info.ecode = ecode;
> +		qp->term_info.in_tx = in_tx;
> +		qp->term_info.valid = 1;
> +	}
> +	siw_dbg_qp(qp,
> +		   "init TERM: layer %d, type %d, code %d, in tx %s\n",
> +		   layer, etype, ecode, in_tx ? "yes" : "no");
> +}
> +
> +/*
> + * Send a TERMINATE message, as defined in RFCs 5040/5041/5044/6581.
> + * Sending TERMINATE messages is best effort - such messages
> + * can only be sent if the QP is still connected and does
> + * not have another outbound message in progress, i.e. the
> + * TERMINATE message must not interfere with an incomplete current
> + * transmit operation.
> + */
> +void siw_send_terminate(struct siw_qp *qp)
> +{
> +	struct kvec		iov[3];
> +	struct msghdr		msg = {.msg_flags = MSG_DONTWAIT|MSG_EOR};
> +	struct iwarp_terminate	*term = NULL;
> +	union iwarp_hdr		*err_hdr = NULL;
> +	struct socket		*s = qp->attrs.sk;
> +	struct siw_iwarp_rx	*rx_ctx = &qp->rx_ctx;
> +	union iwarp_hdr		*rx_hdr = &rx_ctx->hdr;
> +	u32 crc = 0;
> +	int num_frags, len_terminate, rv;
> +
> +	if (!qp->term_info.valid)
> +		return;
> +
> +	qp->term_info.valid = 0;
> +
> +	if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
> +		siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
> +			   tx_type(tx_wqe(qp)));
> +		return;
> +	}
> +	if (!s && qp->cep)
> +		/* QP not yet in RTS. Take socket from connection end point */
> +		s = qp->cep->llp.sock;
> +
> +	if (!s) {
> +		siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
> +		return;
> +	}
> +
> +	term = kzalloc(sizeof(*term), GFP_KERNEL);
> +	if (!term)
> +		return;
> +
> +	term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
> +	term->ddp_mo = 0;
> +	term->ddp_msn = cpu_to_be32(1);
> +
> +	iov[0].iov_base = term;
> +	iov[0].iov_len = sizeof(*term);
> +
> +	if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
> +	    ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
> +	     (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
> +		err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
> +		if (!err_hdr) {
> +			kfree(term);
> +			return;
> +		}
> +	}
> +	memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
> +	       sizeof(struct iwarp_ctrl));
> +
> +	__rdmap_term_set_layer(term, qp->term_info.layer);
> +	__rdmap_term_set_etype(term, qp->term_info.etype);
> +	__rdmap_term_set_ecode(term, qp->term_info.ecode);
> +
> +	switch (qp->term_info.layer) {
> +
> +	case TERM_ERROR_LAYER_RDMAP:
> +		if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
> +			/* No additional DDP/RDMAP header to be included */
> +			break;
> +
> +		if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
> +			/*
> +			 * Complete RDMAP frame will get attached, and
> +			 * DDP segment length is valid
> +			 */
> +			term->flag_m = 1;
> +			term->flag_d = 1;
> +			term->flag_r = 1;
> +
> +			if (qp->term_info.in_tx) {
> +				struct iwarp_rdma_rreq *rreq;
> +				struct siw_wqe *wqe = tx_wqe(qp);
> +
> +				/* Inbound RREQ error, detected during
> +				 * RRESP creation. Take state from
> +				 * current TX work queue element to
> +				 * reconstruct the peer's RREQ.
> +				 */
> +				rreq = (struct iwarp_rdma_rreq *)err_hdr;
> +
> +				memcpy(&rreq->ctrl,
> +				       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
> +				       sizeof(struct iwarp_ctrl));
> +
> +				rreq->rsvd = 0;
> +				rreq->ddp_qn =
> +					htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
> +
> +				/* Provide RREQ's MSN as kept aside */
> +				rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
> +
> +				rreq->ddp_mo = htonl(wqe->processed);
> +				rreq->sink_stag = htonl(wqe->sqe.rkey);
> +				rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
> +				rreq->read_size = htonl(wqe->sqe.sge[0].length);
> +				rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
> +				rreq->source_to =
> +					cpu_to_be64(wqe->sqe.sge[0].laddr);
> +
> +				iov[1].iov_base = rreq;
> +				iov[1].iov_len = sizeof(*rreq);
> +
> +				rx_hdr = (union iwarp_hdr *)rreq;
> +			} else {
> +				/* Take RDMAP/DDP information from
> +				 * current (failed) inbound frame.
> +				 */
> +				iov[1].iov_base = rx_hdr;
> +
> +				if (__rdmap_opcode(&rx_hdr->ctrl) ==
> +				    RDMAP_RDMA_READ_REQ)
> +					iov[1].iov_len =
> +						sizeof(struct iwarp_rdma_rreq);
> +				else /* SEND type */
> +					iov[1].iov_len =
> +						sizeof(struct iwarp_send);
> +			}
> +		} else {
> +			/* Do not report DDP hdr information if packet
> +			 * layout is unknown
> +			 */
> +			if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
> +			    (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
> +				break;
> +
> +			iov[1].iov_base = rx_hdr;
> +
> +			/* Only DDP frame will get attached */
> +			if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
> +				iov[1].iov_len =
> +					sizeof(struct iwarp_rdma_write);
> +			else
> +				iov[1].iov_len = sizeof(struct iwarp_send);
> +
> +			term->flag_m = 1;
> +			term->flag_d = 1;
> +		}
> +		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
> +
> +		break;
> +
> +	case TERM_ERROR_LAYER_DDP:
> +		/* Report error encountered while DDP processing.
> +		 * This can only happen as a result of inbound
> +		 * DDP processing
> +		 */
> +
> +		/* Do not report DDP hdr information if packet
> +		 * layout is unknown
> +		 */
> +		if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
> +		     (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
> +		    ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
> +		     (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
> +			break;
> +
> +		iov[1].iov_base = rx_hdr;
> +
> +		if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
> +			iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
> +		else
> +			iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
> +
> +		term->flag_m = 1;
> +		term->flag_d = 1;
> +
> +		break;
> +
> +	default:
> +		break;
> +
> +	}
> +	if (term->flag_m || term->flag_d || term->flag_r) {
> +		iov[2].iov_base = &crc;
> +		iov[2].iov_len = sizeof(crc);
> +		len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
> +		num_frags = 3;
> +	} else {
> +		iov[1].iov_base = &crc;
> +		iov[1].iov_len = sizeof(crc);
> +		len_terminate = sizeof(*term) + MPA_CRC_SIZE;
> +		num_frags = 2;
> +	}
> +
> +	/* Adjust DDP Segment Length parameter, if valid */
> +	if (term->flag_m) {
> +		u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
> +		enum rdma_opcode op = __rdmap_opcode(&rx_hdr->ctrl);
> +
> +		real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
> +		rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
> +	}
> +
> +	term->ctrl.mpa_len = cpu_to_be16(len_terminate -
> +					 (MPA_HDR_SIZE + MPA_CRC_SIZE));
> +	if (qp->tx_ctx.mpa_crc_hd) {
> +		crypto_shash_init(rx_ctx->mpa_crc_hd);
> +		if (siw_crc_array(rx_ctx->mpa_crc_hd, (u8 *)iov[0].iov_base,
> +				  iov[0].iov_len))
> +			goto out;
> +
> +		if (num_frags == 3) {
> +			if (siw_crc_array(rx_ctx->mpa_crc_hd,
> +					  (u8 *)iov[1].iov_base,
> +					  iov[1].iov_len))
> +				goto out;
> +		}
> +		crypto_shash_final(rx_ctx->mpa_crc_hd, (u8 *)&crc);
> +	}
> +
> +	rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
> +	siw_dbg_qp(qp,
> +		   "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
> +		   rv == len_terminate ? "success" : "failure",
> +		   __rdmap_term_layer(term), __rdmap_term_etype(term),
> +		   __rdmap_term_ecode(term), rv);
> +out:
> +	kfree(term);
> +	kfree(err_hdr);
> +}
> +
> +/*
> + * handle all attrs other than state
> + */
> +static void siw_qp_modify_nonstate(struct siw_qp *qp,
> +				   struct siw_qp_attrs *attrs,
> +				   enum siw_qp_attr_mask mask)
> +{
> +	if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
> +		if (attrs->flags & SIW_RDMA_BIND_ENABLED)
> +			qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
> +		else
> +			qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
> +
> +		if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
> +			qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
> +		else
> +			qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
> +
> +		if (attrs->flags & SIW_RDMA_READ_ENABLED)
> +			qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
> +		else
> +			qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
> +	}
> +}
> +
> +/*
> + * caller holds qp->state_lock
> + */
> +int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
> +		  enum siw_qp_attr_mask mask)
> +{
> +	int	drop_conn = 0, rv = 0;
> +
> +	if (!mask)
> +		return 0;
> +
> +	siw_dbg_qp(qp, "state: %s => %s\n",
> +		   siw_qp_state_to_string[qp->attrs.state],
> +		   siw_qp_state_to_string[attrs->state]);
> +
> +	if (mask != SIW_QP_ATTR_STATE)
> +		siw_qp_modify_nonstate(qp, attrs, mask);
> +
> +	if (!(mask & SIW_QP_ATTR_STATE))
> +		return 0;
> +
> +	switch (qp->attrs.state) {
> +
> +	case SIW_QP_STATE_IDLE:
> +	case SIW_QP_STATE_RTR:
> +
> +		switch (attrs->state) {
> +
> +		case SIW_QP_STATE_RTS:
> +
> +			if (attrs->flags & SIW_MPA_CRC) {
> +				rv = siw_qp_enable_crc(qp);
> +				if (rv)
> +					break;
> +			}
> +			if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
> +				siw_dbg_qp(qp, "no socket\n");
> +				rv = -EINVAL;
> +				break;
> +			}
> +			if (!(mask & SIW_QP_ATTR_MPA)) {
> +				siw_dbg_qp(qp, "no MPA\n");
> +				rv = -EINVAL;
> +				break;
> +			}
> +			siw_dbg_qp(qp, "enter rts, peer 0x%08x, loc 0x%08x\n",
> +				   qp->cep->llp.raddr.sin_addr.s_addr,
> +				   qp->cep->llp.laddr.sin_addr.s_addr);
> +			/*
> +			 * Initialize iWARP TX state
> +			 */
> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
> +
> +			/*
> +			 * Initialize iWARP RX state
> +			 */
> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
> +
> +			/*
> +			 * init IRD free queue, caller has already checked
> +			 * limits.
> +			 */
> +			rv = siw_qp_readq_init(qp, attrs->irq_size,
> +					       attrs->orq_size);
> +			if (rv)
> +				break;
> +
> +			qp->attrs.sk = attrs->sk;
> +			qp->attrs.state = SIW_QP_STATE_RTS;
> +
> +			break;
> +
> +		case SIW_QP_STATE_ERROR:
> +			siw_rq_flush(qp);
> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> +			if (qp->cep) {
> +				siw_cep_put(qp->cep);
> +				qp->cep = NULL;
> +			}
> +			break;
> +
> +		case SIW_QP_STATE_RTR:
> +			/* ignore */
> +			break;
> +
> +		default:
> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> +				   siw_qp_state_to_string[qp->attrs.state],
> +				   siw_qp_state_to_string[attrs->state]);
> +			break;
> +		}
> +		break;
> +
> +	case SIW_QP_STATE_RTS:
> +
> +		switch (attrs->state) {
> +
> +		case SIW_QP_STATE_CLOSING:
> +			/*
> +			 * Verbs: move to IDLE if SQ and ORQ are empty.
> +			 * Move to ERROR otherwise. But first of all we must
> +			 * close the connection. So we keep CLOSING or ERROR
> +			 * as a transient state, schedule connection drop work
> +			 * and wait for the socket state change upcall to
> +			 * come back closed.
> +			 */
> +			if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
> +				qp->attrs.state = SIW_QP_STATE_CLOSING;
> +			} else {
> +				qp->attrs.state = SIW_QP_STATE_ERROR;
> +				siw_sq_flush(qp);
> +			}
> +			siw_rq_flush(qp);
> +
> +			drop_conn = 1;
> +			break;
> +
> +		case SIW_QP_STATE_TERMINATE:
> +			qp->attrs.state = SIW_QP_STATE_TERMINATE;
> +
> +			siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
> +					   RDMAP_ETYPE_CATASTROPHIC,
> +					   RDMAP_ECODE_UNSPECIFIED, 1);
> +			drop_conn = 1;
> +
> +			break;
> +
> +		case SIW_QP_STATE_ERROR:
> +			/*
> +			 * This is an emergency close.
> +			 *
> +			 * Any in progress transmit operation will get
> +			 * cancelled.
> +			 * This will likely result in a protocol failure,
> +			 * if a TX operation is in transit. The caller
> +			 * could unconditionally wait to give the current
> +			 * operation a chance to complete.
> +			 * Esp., how to handle the non-empty IRQ case?
> +			 * The peer was asking for data transfer at a valid
> +			 * point in time.
> +			 */
> +			siw_sq_flush(qp);
> +			siw_rq_flush(qp);
> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> +			drop_conn = 1;
> +
> +			break;
> +
> +		default:
> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> +				   siw_qp_state_to_string[qp->attrs.state],
> +				   siw_qp_state_to_string[attrs->state]);
> +			break;
> +		}
> +		break;
> +
> +	case SIW_QP_STATE_TERMINATE:
> +
> +		switch (attrs->state) {
> +
> +		case SIW_QP_STATE_ERROR:
> +			siw_rq_flush(qp);
> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> +
> +			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
> +				siw_sq_flush(qp);
> +
> +			break;
> +
> +		default:
> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> +				   siw_qp_state_to_string[qp->attrs.state],
> +				   siw_qp_state_to_string[attrs->state]);
> +		}
> +		break;
> +
> +	case SIW_QP_STATE_CLOSING:
> +
> +		switch (attrs->state) {
> +
> +		case SIW_QP_STATE_IDLE:
> +			WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
> +			qp->attrs.state = SIW_QP_STATE_IDLE;
> +
> +			break;
> +
> +		case SIW_QP_STATE_CLOSING:
> +			/*
> +			 * The LLP may have already moved the QP to CLOSING
> +			 * due to a graceful peer close init.
> +			 */
> +			break;
> +
> +		case SIW_QP_STATE_ERROR:
> +			/*
> +			 * QP was moved to CLOSING by LLP event
> +			 * not yet seen by user.
> +			 */
> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> +
> +			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
> +				siw_sq_flush(qp);
> +
> +			siw_rq_flush(qp);
> +
> +			break;
> +
> +		default:
> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> +				   siw_qp_state_to_string[qp->attrs.state],
> +				   siw_qp_state_to_string[attrs->state]);
> +
> +			return -ECONNABORTED;
> +		}
> +		break;
> +
> +	default:
> +		siw_dbg_qp(qp, " noop: state %s\n",
> +			   siw_qp_state_to_string[qp->attrs.state]);
> +		break;
> +	}
> +	if (drop_conn)
> +		siw_qp_cm_drop(qp, 0);
> +
> +	return rv;
> +}
> +
> +struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
> +{
> +	struct siw_qp *qp =  siw_qp_id2obj(to_siw_dev(base_dev), id);
> +
> +	if (qp) {
> +		/*
> +		 * siw_qp_id2obj() increments object reference count
> +		 */
> +		siw_qp_put(qp);
> +		siw_dbg_qp(qp, "got base QP");
> +
> +		return &qp->base_qp;
> +	}
> +	return (struct ib_qp *)NULL;
> +}
> +
> +/*
> + * siw_check_mem()
> + *
> + * Check protection domain, STAG state, access permissions and
> + * address range for memory object.
> + *
> + * @pd:		Protection Domain memory should belong to
> + * @mem:	memory to be checked
> + * @addr:	starting addr of mem
> + * @perms:	requested access permissions
> + * @len:	len of memory interval to be checked
> + *
> + */
> +int siw_check_mem(struct siw_pd *pd, struct siw_mem *mem, u64 addr,
> +		  enum siw_access_flags perms, int len)
> +{
> +	if (siw_mem2mr(mem)->pd != pd) {
> +		siw_dbg(pd->hdr.sdev, "[PD %d]: pd mismatch\n", OBJ_ID(pd));
> +		return -E_PD_MISMATCH;
> +	}
> +	if (!mem->stag_valid) {
> +		siw_dbg(pd->hdr.sdev, "[PD %d]: stag 0x%08x invalid\n",
> +			OBJ_ID(pd), OBJ_ID(mem));
> +		return -E_STAG_INVALID;
> +	}
> +	/*
> +	 * check access permissions
> +	 */
> +	if ((mem->perms & perms) < perms) {
> +		siw_dbg(pd->hdr.sdev, "[PD %d]: permissions 0x%08x < 0x%08x\n",
> +			OBJ_ID(pd), mem->perms, perms);
> +		return -E_ACCESS_PERM;
> +	}
> +	/*
> +	 * Check address interval: we relax the check to allow memory shrunk
> +	 * from the start address _after_ placing or fetching len bytes.
> +	 * TODO: this relaxation is probably overdone
> +	 */
> +	if (addr < mem->va || addr + len > mem->va + mem->len) {
> +		siw_dbg(pd->hdr.sdev, "[PD %d]: MEM interval len %d\n",
> +			OBJ_ID(pd), len);
> +		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx) out of bounds\n",
> +			(unsigned long long)addr,
> +			(unsigned long long)(addr + len));
> +		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx] LKey=0x%08x\n",
> +			(unsigned long long)mem->va,
> +			(unsigned long long)(mem->va + mem->len),
> +			OBJ_ID(mem));
> +
> +		return -E_BASE_BOUNDS;
> +	}
> +	return E_ACCESS_OK;
> +}
> +
> +/*
> + * siw_check_sge()
> + *
> + * Check SGE for access rights in given interval
> + *
> + * @pd:		Protection Domain memory should belong to
> + * @sge:	SGE to be checked
> + * @mem:	array of memory references
> + * @perms:	requested access permissions
> + * @off:	starting offset in SGE
> + * @len:	len of memory interval to be checked
> + *
> + * NOTE: Function references the SGE's memory object (mem->obj)
> + * if not yet done. The new reference is kept if the check went ok and
> + * released if it failed. If mem->obj is already valid, no new
> + * lookup is done and mem is not released if the check fails.
> + */
> +int
> +siw_check_sge(struct siw_pd *pd, struct siw_sge *sge,
> +	      struct siw_mem *mem[], enum siw_access_flags perms,
> +	      u32 off, int len)
> +{
> +	struct siw_device *sdev = pd->hdr.sdev;
> +	int new_ref = 0, rv = E_ACCESS_OK;
> +
> +	if (len + off > sge->length) {
> +		rv = -E_BASE_BOUNDS;
> +		goto fail;
> +	}
> +	if (*mem == NULL) {
> +		*mem = siw_mem_id2obj(sdev, sge->lkey >> 8);
> +		if (*mem == NULL) {
> +			rv = -E_STAG_INVALID;
> +			goto fail;
> +		}
> +		new_ref = 1;
> +	}
> +
> +	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
> +	if (rv)
> +		goto fail;
> +
> +	return 0;
> +
> +fail:
> +	if (new_ref) {
> +		siw_mem_put(*mem);
> +		*mem = NULL;
> +	}
> +	return rv;
> +}
> +
> +void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
> +{
> +	rreq->id = sqe->id;
> +	rreq->opcode = sqe->opcode;
> +	rreq->sge[0].laddr = sqe->sge[0].laddr;
> +	rreq->sge[0].length = sqe->sge[0].length;
> +	rreq->sge[0].lkey = sqe->sge[0].lkey;
> +	rreq->sge[1].lkey = sqe->sge[1].lkey;
> +	rreq->flags = sqe->flags | SIW_WQE_VALID;
> +	rreq->num_sge = 1;
> +}
> +
> +/*
> + * Must be called with SQ locked.
> + * To avoid complete SQ starvation by constant inbound READ requests,
> + * the active IRQ will not be served after qp->irq_burst, if the
> + * SQ has pending work.
> + */
> +int siw_activate_tx(struct siw_qp *qp)
> +{
> +	struct siw_sqe	*irqe, *sqe;
> +	struct siw_wqe	*wqe = tx_wqe(qp);
> +	int rv = 1;
> +
> +	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
> +
> +	if (irqe->flags & SIW_WQE_VALID) {
> +		sqe = sq_get_next(qp);
> +
> +		/*
> +		 * Avoid local WQE processing starvation in case
> +		 * of constant inbound READ request stream
> +		 */
> +		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
> +			qp->irq_burst = 0;
> +			goto skip_irq;
> +		}
> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
> +		wqe->wr_status = SIW_WR_QUEUED;
> +
> +		/* start READ RESPONSE */
> +		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
> +		wqe->sqe.flags = 0;
> +		if (irqe->num_sge) {
> +			wqe->sqe.num_sge = 1;
> +			wqe->sqe.sge[0].length = irqe->sge[0].length;
> +			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
> +			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
> +		} else {
> +			wqe->sqe.num_sge = 0;
> +		}
> +
> +		/* Retain original RREQ's message sequence number for
> +		 * potential error reporting cases.
> +		 */
> +		wqe->sqe.sge[1].length = irqe->sge[1].length;
> +
> +		wqe->sqe.rkey = irqe->rkey;
> +		wqe->sqe.raddr = irqe->raddr;
> +
> +		wqe->processed = 0;
> +		qp->irq_get++;
> +
> +		/* mark current IRQ entry free */
> +		smp_store_mb(irqe->flags, 0);
> +
> +		goto out;
> +	}
> +
> +	sqe = sq_get_next(qp);
> +	if (sqe) {
> +skip_irq:
> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
> +		wqe->wr_status = SIW_WR_QUEUED;
> +
> +		/* First copy SQE to kernel private memory */
> +		memcpy(&wqe->sqe, sqe, sizeof(*sqe));
> +
> +		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
> +			rv = -EINVAL;
> +			goto out;
> +		}
> +		if (wqe->sqe.flags & SIW_WQE_INLINE) {
> +			if (wqe->sqe.opcode != SIW_OP_SEND &&
> +			    wqe->sqe.opcode != SIW_OP_WRITE) {
> +				rv = -EINVAL;
> +				goto out;
> +			}
> +			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
> +				rv = -EINVAL;
> +				goto out;
> +			}
> +			wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1];
> +			wqe->sqe.sge[0].lkey = 0;
> +			wqe->sqe.num_sge = 1;
> +		}
> +		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
> +			/* A READ cannot be fenced */
> +			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
> +			    wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV)) {
> +				siw_dbg_qp(qp, "cannot fence read\n");
> +				rv = -EINVAL;
> +				goto out;
> +			}
> +			spin_lock(&qp->orq_lock);
> +
> +			if (!siw_orq_empty(qp)) {
> +				qp->tx_ctx.orq_fence = 1;
> +				rv = 0;
> +			}
> +			spin_unlock(&qp->orq_lock);
> +
> +		} else if (wqe->sqe.opcode == SIW_OP_READ ||
> +			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
> +			struct siw_sqe	*rreq;
> +
> +			wqe->sqe.num_sge = 1;
> +
> +			spin_lock(&qp->orq_lock);
> +
> +			rreq = orq_get_free(qp);
> +			if (rreq) {
> +				/*
> +				 * Make an immediate copy in ORQ to be ready
> +				 * to process loopback READ reply
> +				 */
> +				siw_read_to_orq(rreq, &wqe->sqe);
> +				qp->orq_put++;
> +			} else {
> +				qp->tx_ctx.orq_fence = 1;
> +				rv = 0;
> +			}
> +			spin_unlock(&qp->orq_lock);
> +		}
> +
> +		/* Clear SQE, can be re-used by application */
> +		smp_store_mb(sqe->flags, 0);
> +		qp->sq_get++;
> +	} else {
> +		rv = 0;
> +	}
> +out:
> +	if (unlikely(rv < 0)) {
> +		siw_dbg_qp(qp, "error %d\n", rv);
> +		wqe->wr_status = SIW_WR_IDLE;
> +	}
> +	return rv;
> +}
> +
> +/*
> + * Check if current CQ state qualifies for
> + * calling CQ completion handler. Must be
> + * called with CQ lock held.
> + */
> +static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
> +{
> +	u64 cq_notify;
> +
> +	if (!cq->base_cq.comp_handler)
> +		return false;
> +
> +	cq_notify = READ_ONCE(*cq->notify);
> +
> +	if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
> +	    ((cq_notify & SIW_NOTIFY_SOLICITED) &&
> +	     (flags & SIW_WQE_SOLICITED))) {
> +		/* dis-arm CQ */
> +		smp_store_mb(*cq->notify, SIW_NOTIFY_NOT);
> +
> +		return true;
> +	}
> +	return false;
> +}
> +
> +/* Must be called without holding CQ lock */
> +static inline void siw_cq_completion(struct siw_cq *cq)
> +{
> +	siw_dbg_obj(cq, "Completion\n");
> +	(*cq->base_cq.comp_handler)(&cq->base_cq, cq->base_cq.cq_context);
> +}
> +
> +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
> +		     enum siw_wc_status status)
> +{
> +	struct siw_cq *cq = qp->scq;
> +	int rv = 0;
> +
> +	if (cq) {
> +		u32 sqe_flags = sqe->flags;
> +		struct siw_cqe *cqe;
> +		u32 idx;
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&cq->lock, flags);
> +
> +		idx = cq->cq_put % cq->num_cqe;
> +		cqe = &cq->queue[idx];
> +
> +		if (!READ_ONCE(cqe->flags)) {
> +			bool notify;
> +
> +			cqe->id = sqe->id;
> +			cqe->opcode = sqe->opcode;
> +			cqe->status = status;
> +			cqe->imm_data = 0;
> +			cqe->bytes = bytes;
> +
> +			if (cq->kernel_verbs) {

kernel_verbs is managed by the IB core, why should the driver know about it?

Thanks
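For readers following the CQ hunk quoted above: siw_cq_notify_now() reads the arm state with READ_ONCE(), disarms it with smp_store_mb(), and only afterwards is the completion handler invoked (without the CQ lock) via siw_cq_completion(). The stand-alone C sketch below models that arm/disarm handshake in user space. It is illustrative only and not part of the patch: all names (demo_cq, demo_post_completion, ...) are invented, C11 atomics stand in for the kernel primitives, a single atomic exchange replaces the lock-serialized read-then-store of the patch, and the solicited-only arming mode is omitted for brevity.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { NOTIFY_NOT = 0, NOTIFY_NEXT = 1 };

struct demo_cq {
	atomic_uint notify;                       /* models *cq->notify */
	void (*comp_handler)(struct demo_cq *);   /* models base_cq.comp_handler */
};

/* Consumer side: (re)arm the CQ before waiting for the next completion. */
static void demo_cq_arm(struct demo_cq *cq)
{
	atomic_store(&cq->notify, NOTIFY_NEXT);
}

/* Producer side: decide whether posting this CQE must raise an upcall. */
static bool demo_cq_notify_now(struct demo_cq *cq)
{
	if (!cq->comp_handler)
		return false;
	/* Read the arm state and disarm in one atomic step, so that
	 * concurrent producers raise at most one upcall per arming.
	 */
	return atomic_exchange(&cq->notify, NOTIFY_NOT) == NOTIFY_NEXT;
}

static void demo_post_completion(struct demo_cq *cq)
{
	/* ... a real driver would fill and publish the CQE slot here ... */
	if (demo_cq_notify_now(cq))
		cq->comp_handler(cq);   /* upcall; siw does this after dropping the CQ lock */
}

static void demo_handler(struct demo_cq *cq)
{
	(void)cq;
	printf("completion upcall\n");
}

int main(void)
{
	struct demo_cq cq = { .comp_handler = demo_handler };

	demo_cq_arm(&cq);
	demo_post_completion(&cq);   /* armed: upcall fires once */
	demo_post_completion(&cq);   /* disarmed: silent until re-armed */
	return 0;
}

The effect is the same as in the patch: however many CQEs producers post, at most one completion upcall is raised per arming, and the consumer re-arms the CQ when it is ready for the next one.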
Bernard Metzler Feb. 27, 2019, 11:54 a.m. UTC | #2
-----"Leon Romanovsky" <leon@kernel.org> wrote: -----

>To: "Bernard Metzler" <bmt@zurich.ibm.com>
>From: "Leon Romanovsky" <leon@kernel.org>
>Date: 02/24/2019 02:19PM
>Cc: linux-rdma@vger.kernel.org
>Subject: Re: [PATCH v5 08/13] SIW queue pair methods
>
>> +
>> +			/*
>> +			 * Initialize iWARP RX state
>> +			 */
>> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
>> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
>> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
>> +
>> +			/*
>> +			 * init IRD free queue, caller has already checked
>> +			 * limits.
>> +			 */
>> +			rv = siw_qp_readq_init(qp, attrs->irq_size,
>> +					       attrs->orq_size);
>> +			if (rv)
>> +				break;
>> +
>> +			qp->attrs.sk = attrs->sk;
>> +			qp->attrs.state = SIW_QP_STATE_RTS;
>> +
>> +			break;
>> +
>> +		case SIW_QP_STATE_ERROR:
>> +			siw_rq_flush(qp);
>> +			qp->attrs.state = SIW_QP_STATE_ERROR;
>> +			if (qp->cep) {
>> +				siw_cep_put(qp->cep);
>> +				qp->cep = NULL;
>> +			}
>> +			break;
>> +
>> +		case SIW_QP_STATE_RTR:
>> +			/* ignore */
>> +			break;
>> +
>> +		default:
>> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
>> +				   siw_qp_state_to_string[qp->attrs.state],
>> +				   siw_qp_state_to_string[attrs->state]);
>> +			break;
>> +		}
>> +		break;
>> +
>> +	case SIW_QP_STATE_RTS:
>> +
>> +		switch (attrs->state) {
>> +
>> +		case SIW_QP_STATE_CLOSING:
>> +			/*
>> +			 * Verbs: move to IDLE if SQ and ORQ are empty.
>> +			 * Move to ERROR otherwise. But first of all we must
>> +			 * close the connection. So we keep CLOSING or ERROR
>> +			 * as a transient state, schedule connection drop work
>> +			 * and wait for the socket state change upcall to
>> +			 * come back closed.
>> +			 */
>> +			if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
>> +				qp->attrs.state = SIW_QP_STATE_CLOSING;
>> +			} else {
>> +				qp->attrs.state = SIW_QP_STATE_ERROR;
>> +				siw_sq_flush(qp);
>> +			}
>> +			siw_rq_flush(qp);
>> +
>> +			drop_conn = 1;
>> +			break;
>> +
>> +		case SIW_QP_STATE_TERMINATE:
>> +			qp->attrs.state = SIW_QP_STATE_TERMINATE;
>> +
>> +			siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
>> +					   RDMAP_ETYPE_CATASTROPHIC,
>> +					   RDMAP_ECODE_UNSPECIFIED, 1);
>> +			drop_conn = 1;
>> +
>> +			break;
>> +
>> +		case SIW_QP_STATE_ERROR:
>> +			/*
>> +			 * This is an emergency close.
>> +			 *
>> +			 * Any in-progress transmit operation will get
>> +			 * cancelled.
>> +			 * This will likely result in a protocol failure
>> +			 * if a TX operation is in transit. The caller
>> +			 * could unconditionally wait to give the current
>> +			 * operation a chance to complete.
>> +			 * Esp., how to handle the non-empty IRQ case?
>> +			 * The peer was asking for data transfer at a valid
>> +			 * point in time.
>> +			 */
>> +			siw_sq_flush(qp);
>> +			siw_rq_flush(qp);
>> +			qp->attrs.state = SIW_QP_STATE_ERROR;
>> +			drop_conn = 1;
>> +
>> +			break;
>> +
>> +		default:
>> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
>> +				   siw_qp_state_to_string[qp->attrs.state],
>> +				   siw_qp_state_to_string[attrs->state]);
>> +			break;
>> +		}
>> +		break;
>> +
>> +	case SIW_QP_STATE_TERMINATE:
>> +
>> +		switch (attrs->state) {
>> +
>> +		case SIW_QP_STATE_ERROR:
>> +			siw_rq_flush(qp);
>> +			qp->attrs.state = SIW_QP_STATE_ERROR;
>> +
>> +			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
>> +				siw_sq_flush(qp);
>> +
>> +			break;
>> +
>> +		default:
>> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
>> +				   siw_qp_state_to_string[qp->attrs.state],
>> +				   siw_qp_state_to_string[attrs->state]);
>> +		}
>> +		break;
>> +
>> +	case SIW_QP_STATE_CLOSING:
>> +
>> +		switch (attrs->state) {
>> +
>> +		case SIW_QP_STATE_IDLE:
>> +			WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
>> +			qp->attrs.state = SIW_QP_STATE_IDLE;
>> +
>> +			break;
>> +
>> +		case SIW_QP_STATE_CLOSING:
>> +			/*
>> +			 * The LLP may have already moved the QP to CLOSING
>> +			 * due to a graceful close initiated by the peer.
>> +			 */
>> +			break;
>> +
>> +		case SIW_QP_STATE_ERROR:
>> +			/*
>> +			 * QP was moved to CLOSING by LLP event
>> +			 * not yet seen by user.
>> +			 */
>> +			qp->attrs.state = SIW_QP_STATE_ERROR;
>> +
>> +			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
>> +				siw_sq_flush(qp);
>> +
>> +			siw_rq_flush(qp);
>> +
>> +			break;
>> +
>> +		default:
>> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
>> +				   siw_qp_state_to_string[qp->attrs.state],
>> +				   siw_qp_state_to_string[attrs->state]);
>> +
>> +			return -ECONNABORTED;
>> +		}
>> +		break;
>> +
>> +	default:
>> +		siw_dbg_qp(qp, " noop: state %s\n",
>> +			   siw_qp_state_to_string[qp->attrs.state]);
>> +		break;
>> +	}
>> +	if (drop_conn)
>> +		siw_qp_cm_drop(qp, 0);
>> +
>> +	return rv;
>> +}
>> +
>> +struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
>> +{
>> +	struct siw_qp *qp =  siw_qp_id2obj(to_siw_dev(base_dev), id);
>> +
>> +	if (qp) {
>> +		/*
>> +		 * siw_qp_id2obj() increments object reference count
>> +		 */
>> +		siw_qp_put(qp);
>> +		siw_dbg_qp(qp, "got base QP\n");
>> +
>> +		return &qp->base_qp;
>> +	}
>> +	return (struct ib_qp *)NULL;
>> +}
>> +
>> +/*
>> + * siw_check_mem()
>> + *
>> + * Check protection domain, STAG state, access permissions and
>> + * address range for memory object.
>> + *
>> + * @pd:		Protection Domain memory should belong to
>> + * @mem:	memory to be checked
>> + * @addr:	starting addr of mem
>> + * @perms:	requested access permissions
>> + * @len:	len of memory interval to be checked
>> + *
>> + */
>> +int siw_check_mem(struct siw_pd *pd, struct siw_mem *mem, u64 addr,
>> +		  enum siw_access_flags perms, int len)
>> +{
>> +	if (siw_mem2mr(mem)->pd != pd) {
>> +		siw_dbg(pd->hdr.sdev, "[PD %d]: pd mismatch\n", OBJ_ID(pd));
>> +		return -E_PD_MISMATCH;
>> +	}
>> +	if (!mem->stag_valid) {
>> +		siw_dbg(pd->hdr.sdev, "[PD %d]: stag 0x%08x invalid\n",
>> +			OBJ_ID(pd), OBJ_ID(mem));
>> +		return -E_STAG_INVALID;
>> +	}
>> +	/*
>> +	 * check access permissions
>> +	 */
>> +	if ((mem->perms & perms) < perms) {
>> +		siw_dbg(pd->hdr.sdev, "[PD %d]: permissions 0x%08x < 0x%08x\n",
>> +			OBJ_ID(pd), mem->perms, perms);
>> +		return -E_ACCESS_PERM;
>> +	}
>> +	/*
>> +	 * Check address interval: we relax the check to allow memory shrunk
>> +	 * from the start address _after_ placing or fetching len bytes.
>> +	 * TODO: this relaxation is probably overdone
>> +	 */
>> +	if (addr < mem->va || addr + len > mem->va + mem->len) {
>> +		siw_dbg(pd->hdr.sdev, "[PD %d]: MEM interval len %d\n",
>> +			OBJ_ID(pd), len);
>> +		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx) out of bounds\n",
>> +			(unsigned long long)addr,
>> +			(unsigned long long)(addr + len));
>> +		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx] LKey=0x%08x\n",
>> +			(unsigned long long)mem->va,
>> +			(unsigned long long)(mem->va + mem->len),
>> +			OBJ_ID(mem));
>> +
>> +		return -E_BASE_BOUNDS;
>> +	}
>> +	return E_ACCESS_OK;
>> +}
>> +
>> +/*
>> + * siw_check_sge()
>> + *
>> + * Check SGE for access rights in given interval
>> + *
>> + * @pd:		Protection Domain memory should belong to
>> + * @sge:	SGE to be checked
>> + * @mem:	array of memory references
>> + * @perms:	requested access permissions
>> + * @off:	starting offset in SGE
>> + * @len:	len of memory interval to be checked
>> + *
>> + * NOTE: Function references SGE's memory object (mem->obj)
>> + * if not yet done. New reference is kept if check went ok and
>> + * released if check failed. If mem->obj is already valid, no new
>> + * lookup is done and mem is not released if the check fails.
>> + */
>> +int
>> +siw_check_sge(struct siw_pd *pd, struct siw_sge *sge,
>> +	      struct siw_mem *mem[], enum siw_access_flags perms,
>> +	      u32 off, int len)
>> +{
>> +	struct siw_device *sdev = pd->hdr.sdev;
>> +	int new_ref = 0, rv = E_ACCESS_OK;
>> +
>> +	if (len + off > sge->length) {
>> +		rv = -E_BASE_BOUNDS;
>> +		goto fail;
>> +	}
>> +	if (*mem == NULL) {
>> +		*mem = siw_mem_id2obj(sdev, sge->lkey >> 8);
>> +		if (*mem == NULL) {
>> +			rv = -E_STAG_INVALID;
>> +			goto fail;
>> +		}
>> +		new_ref = 1;
>> +	}
>> +
>> +	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
>> +	if (rv)
>> +		goto fail;
>> +
>> +	return 0;
>> +
>> +fail:
>> +	if (new_ref) {
>> +		siw_mem_put(*mem);
>> +		*mem = NULL;
>> +	}
>> +	return rv;
>> +}
>> +
>> +void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
>> +{
>> +	rreq->id = sqe->id;
>> +	rreq->opcode = sqe->opcode;
>> +	rreq->sge[0].laddr = sqe->sge[0].laddr;
>> +	rreq->sge[0].length = sqe->sge[0].length;
>> +	rreq->sge[0].lkey = sqe->sge[0].lkey;
>> +	rreq->sge[1].lkey = sqe->sge[1].lkey;
>> +	rreq->flags = sqe->flags | SIW_WQE_VALID;
>> +	rreq->num_sge = 1;
>> +}
>> +
>> +/*
>> + * Must be called with SQ locked.
>> + * To avoid complete SQ starvation by constant inbound READ requests,
>> + * the active IRQ will not be served after qp->irq_burst, if the
>> + * SQ has pending work.
>> + */
>> +int siw_activate_tx(struct siw_qp *qp)
>> +{
>> +	struct siw_sqe	*irqe, *sqe;
>> +	struct siw_wqe	*wqe = tx_wqe(qp);
>> +	int rv = 1;
>> +
>> +	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
>> +
>> +	if (irqe->flags & SIW_WQE_VALID) {
>> +		sqe = sq_get_next(qp);
>> +
>> +		/*
>> +		 * Avoid local WQE processing starvation in case
>> +		 * of constant inbound READ request stream
>> +		 */
>> +		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
>> +			qp->irq_burst = 0;
>> +			goto skip_irq;
>> +		}
>> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
>> +		wqe->wr_status = SIW_WR_QUEUED;
>> +
>> +		/* start READ RESPONSE */
>> +		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
>> +		wqe->sqe.flags = 0;
>> +		if (irqe->num_sge) {
>> +			wqe->sqe.num_sge = 1;
>> +			wqe->sqe.sge[0].length = irqe->sge[0].length;
>> +			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
>> +			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
>> +		} else {
>> +			wqe->sqe.num_sge = 0;
>> +		}
>> +
>> +		/* Retain original RREQ's message sequence number for
>> +		 * potential error reporting cases.
>> +		 */
>> +		wqe->sqe.sge[1].length = irqe->sge[1].length;
>> +
>> +		wqe->sqe.rkey = irqe->rkey;
>> +		wqe->sqe.raddr = irqe->raddr;
>> +
>> +		wqe->processed = 0;
>> +		qp->irq_get++;
>> +
>> +		/* mark current IRQ entry free */
>> +		smp_store_mb(irqe->flags, 0);
>> +
>> +		goto out;
>> +	}
>> +
>> +	sqe = sq_get_next(qp);
>> +	if (sqe) {
>> +skip_irq:
>> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
>> +		wqe->wr_status = SIW_WR_QUEUED;
>> +
>> +		/* First copy SQE to kernel private memory */
>> +		memcpy(&wqe->sqe, sqe, sizeof(*sqe));
>> +
>> +		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
>> +			rv = -EINVAL;
>> +			goto out;
>> +		}
>> +		if (wqe->sqe.flags & SIW_WQE_INLINE) {
>> +			if (wqe->sqe.opcode != SIW_OP_SEND &&
>> +			    wqe->sqe.opcode != SIW_OP_WRITE) {
>> +				rv = -EINVAL;
>> +				goto out;
>> +			}
>> +			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
>> +				rv = -EINVAL;
>> +				goto out;
>> +			}
>> +			wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1];
>> +			wqe->sqe.sge[0].lkey = 0;
>> +			wqe->sqe.num_sge = 1;
>> +		}
>> +		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
>> +			/* A READ cannot be fenced */
>> +			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
>> +			    wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV)) {
>> +				siw_dbg_qp(qp, "cannot fence read\n");
>> +				rv = -EINVAL;
>> +				goto out;
>> +			}
>> +			spin_lock(&qp->orq_lock);
>> +
>> +			if (!siw_orq_empty(qp)) {
>> +				qp->tx_ctx.orq_fence = 1;
>> +				rv = 0;
>> +			}
>> +			spin_unlock(&qp->orq_lock);
>> +
>> +		} else if (wqe->sqe.opcode == SIW_OP_READ ||
>> +			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
>> +			struct siw_sqe	*rreq;
>> +
>> +			wqe->sqe.num_sge = 1;
>> +
>> +			spin_lock(&qp->orq_lock);
>> +
>> +			rreq = orq_get_free(qp);
>> +			if (rreq) {
>> +				/*
>> +				 * Make an immediate copy in ORQ to be ready
>> +				 * to process loopback READ reply
>> +				 */
>> +				siw_read_to_orq(rreq, &wqe->sqe);
>> +				qp->orq_put++;
>> +			} else {
>> +				qp->tx_ctx.orq_fence = 1;
>> +				rv = 0;
>> +			}
>> +			spin_unlock(&qp->orq_lock);
>> +		}
>> +
>> +		/* Clear SQE, can be re-used by application */
>> +		smp_store_mb(sqe->flags, 0);
>> +		qp->sq_get++;
>> +	} else {
>> +		rv = 0;
>> +	}
>> +out:
>> +	if (unlikely(rv < 0)) {
>> +		siw_dbg_qp(qp, "error %d\n", rv);
>> +		wqe->wr_status = SIW_WR_IDLE;
>> +	}
>> +	return rv;
>> +}
>> +
>> +/*
>> + * Check if current CQ state qualifies for
>> + * calling CQ completion handler. Must be
>> + * called with CQ lock held.
>> + */
>> +static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
>> +{
>> +	u64 cq_notify;
>> +
>> +	if (!cq->base_cq.comp_handler)
>> +		return false;
>> +
>> +	cq_notify = READ_ONCE(*cq->notify);
>> +
>> +	if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
>> +	    ((cq_notify & SIW_NOTIFY_SOLICITED) &&
>> +	     (flags & SIW_WQE_SOLICITED))) {
>> +		/* dis-arm CQ */
>> +		smp_store_mb(*cq->notify, SIW_NOTIFY_NOT);
>> +
>> +		return true;
>> +	}
>> +	return false;
>> +}
>> +
>> +/* Must be called without holding CQ lock */
>> +static inline void siw_cq_completion(struct siw_cq *cq)
>> +{
>> +	siw_dbg_obj(cq, "Completion\n");
>> +	(*cq->base_cq.comp_handler)(&cq->base_cq, cq->base_cq.cq_context);
>> +}
>> +
>> +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
>> +		     enum siw_wc_status status)
>> +{
>> +	struct siw_cq *cq = qp->scq;
>> +	int rv = 0;
>> +
>> +	if (cq) {
>> +		u32 sqe_flags = sqe->flags;
>> +		struct siw_cqe *cqe;
>> +		u32 idx;
>> +		unsigned long flags;
>> +
>> +		spin_lock_irqsave(&cq->lock, flags);
>> +
>> +		idx = cq->cq_put % cq->num_cqe;
>> +		cqe = &cq->queue[idx];
>> +
>> +		if (!READ_ONCE(cqe->flags)) {
>> +			bool notify;
>> +
>> +			cqe->id = sqe->id;
>> +			cqe->opcode = sqe->opcode;
>> +			cqe->status = status;
>> +			cqe->imm_data = 0;
>> +			cqe->bytes = bytes;
>> +
>> +			if (cq->kernel_verbs) {
>
>kernel_verbs is managed by Ib/core, why should driver know about it?
>
User-land CQEs carry the corresponding QP ID, while kernel clients expect a QP
pointer here. That's where the difference comes from. This distinction
between kernel and user clients is needed in more places, e.g. the
user-land CQE array is memory mapped, while the kernel-land one is not.
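
To illustrate the difference (names below are made up for the example, not
the actual siw ABI): a user-mapped CQE can only carry a plain QP ID, while an
in-kernel consumer expects the QP pointer handed back:

	/* Illustrative sketch only -- not the real siw CQE layout */
	struct example_cqe {
		u64	id;
		u32	bytes;
		union {
			u64		qp_id;	/* user CQ: mmapped array carries IDs only */
			struct siw_qp	*qp;	/* kernel CQ: consumer expects a pointer   */
		} base;
	};

	static void example_fill_cqe(struct example_cqe *cqe, struct siw_qp *qp,
				     u32 qp_id, bool kernel_cq)
	{
		if (kernel_cq)
			cqe->base.qp = qp;	 /* kernel verbs get the QP handle */
		else
			cqe->base.qp_id = qp_id; /* user space resolves the ID itself */
	}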
Leon Romanovsky Feb. 27, 2019, 12:48 p.m. UTC | #3
On Wed, Feb 27, 2019 at 11:54:07AM +0000, Bernard Metzler wrote:
> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
>
> >To: "Bernard Metzler" <bmt@zurich.ibm.com>
> >From: "Leon Romanovsky" <leon@kernel.org>
> >Date: 02/24/2019 02:19PM
> >Cc: linux-rdma@vger.kernel.org
> >Subject: Re: [PATCH v5 08/13] SIW queue pair methods
> >
> >On Tue, Feb 19, 2019 at 11:08:58AM +0100, Bernard Metzler wrote:
> >> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
> >> ---
> >>  drivers/infiniband/sw/siw/siw_qp.c | 1478
> >++++++++++++++++++++++++++++
> >>  1 file changed, 1478 insertions(+)
> >>  create mode 100644 drivers/infiniband/sw/siw/siw_qp.c
> >>
> >> diff --git a/drivers/infiniband/sw/siw/siw_qp.c
> >b/drivers/infiniband/sw/siw/siw_qp.c
> >> new file mode 100644
> >> index 000000000000..75fd151dae39
> >> --- /dev/null
> >> +++ b/drivers/infiniband/sw/siw/siw_qp.c
> >> @@ -0,0 +1,1478 @@
> >> +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
> >> +/*
> >> + * Software iWARP device driver
> >> + *
> >> + * Authors: Bernard Metzler <bmt@zurich.ibm.com>
> >> + *
> >> + * Copyright (c) 2008-2018, IBM Corporation
> >> + *
> >> + * This software is available to you under a choice of one of two
> >> + * licenses. You may choose to be licensed under the terms of the
> >GNU
> >> + * General Public License (GPL) Version 2, available from the file
> >> + * COPYING in the main directory of this source tree, or the
> >> + * BSD license below:
> >> + *
> >> + *   Redistribution and use in source and binary forms, with or
> >> + *   without modification, are permitted provided that the
> >following
> >> + *   conditions are met:
> >> + *
> >> + *   - Redistributions of source code must retain the above
> >copyright notice,
> >> + *     this list of conditions and the following disclaimer.
> >> + *
> >> + *   - Redistributions in binary form must reproduce the above
> >copyright
> >> + *     notice, this list of conditions and the following
> >disclaimer in the
> >> + *     documentation and/or other materials provided with the
> >distribution.
> >> + *
> >> + *   - Neither the name of IBM nor the names of its contributors
> >may be
> >> + *     used to endorse or promote products derived from this
> >software without
> >> + *     specific prior written permission.
> >> + *
> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> >> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
> >OF
> >> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> >> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
> >HOLDERS
> >> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
> >AN
> >> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
> >IN
> >> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> >THE
> >> + * SOFTWARE.
> >> + */
> >> +
> >> +#include <linux/errno.h>
> >> +#include <linux/types.h>
> >> +#include <linux/net.h>
> >> +#include <linux/file.h>
> >> +#include <linux/scatterlist.h>
> >> +#include <linux/highmem.h>
> >> +#include <linux/vmalloc.h>
> >> +#include <asm/barrier.h>
> >> +#include <net/sock.h>
> >> +#include <net/tcp_states.h>
> >> +#include <net/tcp.h>
> >> +
> >> +#include <rdma/iw_cm.h>
> >> +#include <rdma/ib_verbs.h>
> >> +#include <rdma/ib_smi.h>
> >> +#include <rdma/ib_user_verbs.h>
> >> +
> >> +#include "siw.h"
> >> +#include "siw_obj.h"
> >> +#include "siw_cm.h"
> >> +
> >> +static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof
> >"TERMINATE"] = {
> >> +	[SIW_QP_STATE_IDLE]		= "IDLE",
> >> +	[SIW_QP_STATE_RTR]		= "RTR",
> >> +	[SIW_QP_STATE_RTS]		= "RTS",
> >> +	[SIW_QP_STATE_CLOSING]		= "CLOSING",
> >> +	[SIW_QP_STATE_TERMINATE]	= "TERMINATE",
> >> +	[SIW_QP_STATE_ERROR]		= "ERROR"
> >> +};
> >> +
> >> +/*
> >> + * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp
> >settings on a
> >> + * per-RDMAP message basis. Please keep order of initializer. All
> >MPA len
> >> + * is initialized to minimum packet size.
> >> + */
> >> +struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = { {
> >> +	/* RDMAP_RDMA_WRITE */
> >> +	.hdr_len = sizeof(struct iwarp_rdma_write),
> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST
> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> +		| cpu_to_be16(RDMAP_RDMA_WRITE),
> >> +	.proc_data = siw_proc_write
> >> +},
> >> +{	/* RDMAP_RDMA_READ_REQ */
> >> +	.hdr_len = sizeof(struct iwarp_rdma_rreq),
> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> +		| cpu_to_be16(RDMAP_RDMA_READ_REQ),
> >> +	.proc_data = siw_proc_rreq
> >> +},
> >> +{	/* RDMAP_RDMA_READ_RESP */
> >> +	.hdr_len = sizeof(struct iwarp_rdma_rresp),
> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST
> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> +		| cpu_to_be16(RDMAP_RDMA_READ_RESP),
> >> +	.proc_data = siw_proc_rresp
> >> +},
> >> +{	/* RDMAP_SEND */
> >> +	.hdr_len = sizeof(struct iwarp_send),
> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> +		| cpu_to_be16(RDMAP_SEND),
> >> +	.proc_data = siw_proc_send
> >> +},
> >> +{	/* RDMAP_SEND_INVAL */
> >> +	.hdr_len = sizeof(struct iwarp_send_inv),
> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> +		| cpu_to_be16(RDMAP_SEND_INVAL),
> >> +	.proc_data = siw_proc_send
> >> +},
> >> +{	/* RDMAP_SEND_SE */
> >> +	.hdr_len = sizeof(struct iwarp_send),
> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> +		| cpu_to_be16(RDMAP_SEND_SE),
> >> +	.proc_data = siw_proc_send
> >> +},
> >> +{	/* RDMAP_SEND_SE_INVAL */
> >> +	.hdr_len = sizeof(struct iwarp_send_inv),
> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> +		| cpu_to_be16(RDMAP_SEND_SE_INVAL),
> >> +	.proc_data = siw_proc_send
> >> +},
> >> +{	/* RDMAP_TERMINATE */
> >> +	.hdr_len = sizeof(struct iwarp_terminate),
> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> +		| cpu_to_be16(RDMAP_TERMINATE),
> >> +	.proc_data = siw_proc_terminate
> >> +} };
> >> +
> >> +void siw_qp_llp_data_ready(struct sock *sk)
> >> +{
> >> +	struct siw_qp		*qp;
> >> +
> >> +	read_lock(&sk->sk_callback_lock);
> >> +
> >> +	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
> >> +		goto done;
> >> +
> >> +	qp = sk_to_qp(sk);
> >> +
> >> +	if (likely(!qp->rx_ctx.rx_suspend &&
> >> +		   down_read_trylock(&qp->state_lock))) {
> >> +		read_descriptor_t rd_desc = {.arg.data = qp, .count = 1};
> >> +
> >> +		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
> >> +			/*
> >> +			 * Implements data receive operation during
> >> +			 * socket callback. TCP gracefully catches
> >> +			 * the case where there is nothing to receive
> >> +			 * (not calling siw_tcp_rx_data() then).
> >> +			 */
> >> +			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
> >> +
> >> +		up_read(&qp->state_lock);
> >> +	} else {
> >> +		siw_dbg_qp(qp, "unable to rx, suspend: %d\n",
> >> +			   qp->rx_ctx.rx_suspend);
> >> +	}
> >> +done:
> >> +	read_unlock(&sk->sk_callback_lock);
> >> +}
> >> +
> >> +void siw_qp_llp_close(struct siw_qp *qp)
> >> +{
> >> +	siw_dbg_qp(qp, "enter llp close, state = %s\n",
> >> +		   siw_qp_state_to_string[qp->attrs.state]);
> >> +
> >> +	down_write(&qp->state_lock);
> >> +
> >> +	qp->rx_ctx.rx_suspend = 1;
> >> +	qp->tx_ctx.tx_suspend = 1;
> >> +	qp->attrs.sk = NULL;
> >> +
> >> +	switch (qp->attrs.state) {
> >> +
> >> +	case SIW_QP_STATE_RTS:
> >> +	case SIW_QP_STATE_RTR:
> >> +	case SIW_QP_STATE_IDLE:
> >> +	case SIW_QP_STATE_TERMINATE:
> >> +
> >> +		qp->attrs.state = SIW_QP_STATE_ERROR;
> >> +
> >> +		break;
> >> +	/*
> >> +	 * SIW_QP_STATE_CLOSING:
> >> +	 *
> >> +	 * This is a forced close. shall the QP be moved to
> >> +	 * ERROR or IDLE ?
> >> +	 */
> >> +	case SIW_QP_STATE_CLOSING:
> >> +		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> >> +		else
> >> +			qp->attrs.state = SIW_QP_STATE_IDLE;
> >> +
> >> +		break;
> >> +
> >> +	default:
> >> +		siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
> >> +			   siw_qp_state_to_string[qp->attrs.state]);
> >> +		break;
> >> +	}
> >> +	siw_sq_flush(qp);
> >> +	siw_rq_flush(qp);
> >> +
> >> +	/*
> >> +	 * Dereference closing CEP
> >> +	 */
> >> +	if (qp->cep) {
> >> +		siw_cep_put(qp->cep);
> >> +		qp->cep = NULL;
> >> +	}
> >> +
> >> +	up_write(&qp->state_lock);
> >> +
> >> +	siw_dbg_qp(qp, "llp close exit: state %s\n",
> >> +		   siw_qp_state_to_string[qp->attrs.state]);
> >> +}
> >> +
> >> +/*
> >> + * socket callback routine informing about newly available send
> >space.
> >> + * Function schedules SQ work for processing SQ items.
> >> + */
> >> +void siw_qp_llp_write_space(struct sock *sk)
> >> +{
> >> +	struct siw_cep	*cep = sk_to_cep(sk);
> >> +
> >> +	cep->sk_write_space(sk);
> >> +
> >> +	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
> >> +		(void) siw_sq_start(cep->qp);
> >> +}
> >> +
> >> +static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int
> >orq_size)
> >> +{
> >> +	if (!irq_size)
> >> +		irq_size = 1;
> >> +	if (!orq_size)
> >> +		orq_size = 1;
> >> +
> >> +	qp->attrs.irq_size = irq_size;
> >> +	qp->attrs.orq_size = orq_size;
> >> +
> >> +	qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
> >> +	if (!qp->irq) {
> >> +		siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
> >> +		qp->attrs.irq_size = 0;
> >> +		return -ENOMEM;
> >> +	}
> >> +	qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
> >> +	if (!qp->orq) {
> >> +		siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
> >> +		qp->attrs.orq_size = 0;
> >> +		qp->attrs.irq_size = 0;
> >> +		vfree(qp->irq);
> >> +		return -ENOMEM;
> >> +	}
> >> +	return 0;
> >> +}
> >> +
> >> +static int siw_qp_enable_crc(struct siw_qp *qp)
> >> +{
> >> +	struct siw_iwarp_rx *c_rx = &qp->rx_ctx;
> >> +	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
> >> +	int rv = 0;
> >> +
> >> +	if (siw_crypto_shash == NULL) {
> >> +		rv = -ENOENT;
> >> +		goto error;
> >> +	}
> >> +	c_tx->mpa_crc_hd = kzalloc(sizeof(struct shash_desc) +
> >> +				   crypto_shash_descsize(siw_crypto_shash),
> >> +				   GFP_KERNEL);
> >> +	c_rx->mpa_crc_hd = kzalloc(sizeof(struct shash_desc) +
> >> +				   crypto_shash_descsize(siw_crypto_shash),
> >> +				   GFP_KERNEL);
> >> +	if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
> >> +		rv = -ENOMEM;
> >> +		goto error;
> >> +	}
> >> +	c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
> >> +	c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
> >> +
> >> +	return 0;
> >> +error:
> >> +	siw_dbg_qp(qp, "falied loading crc32c. error %d\n", rv);
> >> +
> >> +	kfree(c_tx->mpa_crc_hd);
> >> +	kfree(c_rx->mpa_crc_hd);
> >> +
> >> +	c_tx->mpa_crc_hd = c_rx->mpa_crc_hd = NULL;
> >> +
> >> +	return rv;
> >> +}
> >> +
> >> +/*
> >> + * Send a non signalled READ or WRITE to peer side as negotiated
> >> + * with MPAv2 P2P setup protocol. The work request is only created
> >> + * as a current active WR and does not consume Send Queue space.
> >> + *
> >> + * Caller must hold QP state lock.
> >> + */
> >> +int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
> >> +{
> >> +	struct siw_wqe	*wqe = tx_wqe(qp);
> >> +	unsigned long flags;
> >> +	int rv = 0;
> >> +
> >> +	spin_lock_irqsave(&qp->sq_lock, flags);
> >> +
> >> +	if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
> >> +		spin_unlock_irqrestore(&qp->sq_lock, flags);
> >> +		return -EIO;
> >> +	}
> >> +	memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
> >> +
> >> +	wqe->wr_status = SIW_WR_QUEUED;
> >> +	wqe->sqe.flags = 0;
> >> +	wqe->sqe.num_sge = 1;
> >> +	wqe->sqe.sge[0].length = 0;
> >> +	wqe->sqe.sge[0].laddr = 0;
> >> +	wqe->sqe.sge[0].lkey = 0;
> >> +	/*
> >> +	 * While it must not be checked for inbound zero length
> >> +	 * READ/WRITE, some HW may treat STag 0 special.
> >> +	 */
> >> +	wqe->sqe.rkey = 1;
> >> +	wqe->sqe.raddr = 0;
> >> +	wqe->processed = 0;
> >> +
> >> +	if (ctrl & MPA_V2_RDMA_WRITE_RTR)
> >> +		wqe->sqe.opcode = SIW_OP_WRITE;
> >> +	else if (ctrl & MPA_V2_RDMA_READ_RTR) {
> >> +		struct siw_sqe	*rreq;
> >> +
> >> +		wqe->sqe.opcode = SIW_OP_READ;
> >> +
> >> +		spin_lock(&qp->orq_lock);
> >> +
> >> +		rreq = orq_get_free(qp);
> >> +		if (rreq) {
> >> +			siw_read_to_orq(rreq, &wqe->sqe);
> >> +			qp->orq_put++;
> >> +		} else
> >> +			rv = -EIO;
> >> +
> >> +		spin_unlock(&qp->orq_lock);
> >> +	} else
> >> +		rv = -EINVAL;
> >> +
> >> +	if (rv)
> >> +		wqe->wr_status = SIW_WR_IDLE;
> >> +
> >> +	spin_unlock_irqrestore(&qp->sq_lock, flags);
> >> +
> >> +	if (!rv)
> >> +		rv = siw_sq_start(qp);
> >> +
> >> +	return rv;
> >> +}
> >> +
> >> +/*
> >> + * Map memory access error to DDP tagged error
> >> + */
> >> +enum ddp_ecode siw_tagged_error(enum siw_access_state state)
> >> +{
> >> +	if (state == E_STAG_INVALID)
> >> +		return DDP_ECODE_T_INVALID_STAG;
> >> +	if (state == E_BASE_BOUNDS)
> >> +		return DDP_ECODE_T_BASE_BOUNDS;
> >> +	if (state == E_PD_MISMATCH)
> >> +		return DDP_ECODE_T_STAG_NOT_ASSOC;
> >> +	if (state == E_ACCESS_PERM)
> >> +		/*
> >> +		 * RFC 5041 (DDP) lacks an ecode for insufficient access
> >> +		 * permissions. 'Invalid STag' seem to be the closest
> >> +		 * match though.
> >> +		 */
> >> +		return DDP_ECODE_T_INVALID_STAG;
> >> +
> >> +	WARN_ON(1);
> >> +
> >> +	return DDP_ECODE_T_INVALID_STAG;
> >> +}
> >> +
> >> +/*
> >> + * Map memory access error to RDMAP protection error
> >> + */
> >> +enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
> >> +{
> >> +	if (state == E_STAG_INVALID)
> >> +		return RDMAP_ECODE_INVALID_STAG;
> >> +	if (state == E_BASE_BOUNDS)
> >> +		return RDMAP_ECODE_BASE_BOUNDS;
> >> +	if (state == E_PD_MISMATCH)
> >> +		return RDMAP_ECODE_STAG_NOT_ASSOC;
> >> +	if (state == E_ACCESS_PERM)
> >> +		return RDMAP_ECODE_ACCESS_RIGHTS;
> >> +
> >> +	return RDMAP_ECODE_UNSPECIFIED;
> >> +}
> >> +
> >> +void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer,
> >> +			u8 etype, u8 ecode, int in_tx)
> >> +{
> >> +	if (!qp->term_info.valid) {
> >> +		memset(&qp->term_info, 0, sizeof(qp->term_info));
> >> +		qp->term_info.layer = layer;
> >> +		qp->term_info.etype = etype;
> >> +		qp->term_info.ecode = ecode;
> >> +		qp->term_info.in_tx = in_tx;
> >> +		qp->term_info.valid = 1;
> >> +	}
> >> +	siw_dbg_qp(qp,
> >> +		   "init TERM: layer %d, type %d, code %d, in tx %s\n",
> >> +		   layer, etype, ecode, in_tx ? "yes" : "no");
> >> +}
> >> +
> >> +/*
> >> + * Send a TERMINATE message, as defined in RFC's
> >5040/5041/5044/6581.
> >> + * Sending TERMINATE messages is best effort - such messages
> >> + * can only be send if the QP is still connected and it does
> >> + * not have another outbound message in-progress, i.e. the
> >> + * TERMINATE message must not interfer with an incomplete current
> >> + * transmit operation.
> >> + */
> >> +void siw_send_terminate(struct siw_qp *qp)
> >> +{
> >> +	struct kvec		iov[3];
> >> +	struct msghdr		msg = {.msg_flags = MSG_DONTWAIT|MSG_EOR};
> >> +	struct iwarp_terminate	*term = NULL;
> >> +	union iwarp_hdr		*err_hdr = NULL;
> >> +	struct socket		*s = qp->attrs.sk;
> >> +	struct siw_iwarp_rx	*rx_ctx = &qp->rx_ctx;
> >> +	union iwarp_hdr		*rx_hdr = &rx_ctx->hdr;
> >> +	u32 crc = 0;
> >> +	int num_frags, len_terminate, rv;
> >> +
> >> +	if (!qp->term_info.valid)
> >> +		return;
> >> +
> >> +	qp->term_info.valid = 0;
> >> +
> >> +	if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
> >> +		siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
> >> +			   tx_type(tx_wqe(qp)));
> >> +		return;
> >> +	}
> >> +	if (!s && qp->cep)
> >> +		/* QP not yet in RTS. Take socket from connection end point */
> >> +		s = qp->cep->llp.sock;
> >> +
> >> +	if (!s) {
> >> +		siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
> >> +		return;
> >> +	}
> >> +
> >> +	term = kzalloc(sizeof(*term), GFP_KERNEL);
> >> +	if (!term)
> >> +		return;
> >> +
> >> +	term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
> >> +	term->ddp_mo = 0;
> >> +	term->ddp_msn = cpu_to_be32(1);
> >> +
> >> +	iov[0].iov_base = term;
> >> +	iov[0].iov_len = sizeof(*term);
> >> +
> >> +	if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
> >> +	    ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
> >> +	     (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
> >> +		err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
> >> +		if (!err_hdr) {
> >> +			kfree(term);
> >> +			return;
> >> +		}
> >> +	}
> >> +	memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
> >> +	       sizeof(struct iwarp_ctrl));
> >> +
> >> +	__rdmap_term_set_layer(term, qp->term_info.layer);
> >> +	__rdmap_term_set_etype(term, qp->term_info.etype);
> >> +	__rdmap_term_set_ecode(term, qp->term_info.ecode);
> >> +
> >> +	switch (qp->term_info.layer) {
> >> +
> >> +	case TERM_ERROR_LAYER_RDMAP:
> >> +		if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
> >> +			/* No additional DDP/RDMAP header to be included */
> >> +			break;
> >> +
> >> +		if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
> >> +			/*
> >> +			 * Complete RDMAP frame will get attached, and
> >> +			 * DDP segment length is valid
> >> +			 */
> >> +			term->flag_m = 1;
> >> +			term->flag_d = 1;
> >> +			term->flag_r = 1;
> >> +
> >> +			if (qp->term_info.in_tx) {
> >> +				struct iwarp_rdma_rreq *rreq;
> >> +				struct siw_wqe *wqe = tx_wqe(qp);
> >> +
> >> +				/* Inbound RREQ error, detected during
> >> +				 * RRESP creation. Take state from
> >> +				 * current TX work queue element to
> >> +				 * reconstruct peers RREQ.
> >> +				 */
> >> +				rreq = (struct iwarp_rdma_rreq *)err_hdr;
> >> +
> >> +				memcpy(&rreq->ctrl,
> >> +				       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
> >> +				       sizeof(struct iwarp_ctrl));
> >> +
> >> +				rreq->rsvd = 0;
> >> +				rreq->ddp_qn =
> >> +					htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
> >> +
> >> +				/* Provide RREQ's MSN as kept aside */
> >> +				rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
> >> +
> >> +				rreq->ddp_mo = htonl(wqe->processed);
> >> +				rreq->sink_stag = htonl(wqe->sqe.rkey);
> >> +				rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
> >> +				rreq->read_size = htonl(wqe->sqe.sge[0].length);
> >> +				rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
> >> +				rreq->source_to =
> >> +					cpu_to_be64(wqe->sqe.sge[0].laddr);
> >> +
> >> +				iov[1].iov_base = rreq;
> >> +				iov[1].iov_len = sizeof(*rreq);
> >> +
> >> +				rx_hdr = (union iwarp_hdr *)rreq;
> >> +			} else {
> >> +				/* Take RDMAP/DDP information from
> >> +				 * current (failed) inbound frame.
> >> +				 */
> >> +				iov[1].iov_base = rx_hdr;
> >> +
> >> +				if (__rdmap_opcode(&rx_hdr->ctrl) ==
> >> +				    RDMAP_RDMA_READ_REQ)
> >> +					iov[1].iov_len =
> >> +						sizeof(struct iwarp_rdma_rreq);
> >> +				else /* SEND type */
> >> +					iov[1].iov_len =
> >> +						sizeof(struct iwarp_send);
> >> +			}
> >> +		} else {
> >> +			/* Do not report DDP hdr information if packet
> >> +			 * layout is unknown
> >> +			 */
> >> +			if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
> >> +			    (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
> >> +				break;
> >> +
> >> +			iov[1].iov_base = rx_hdr;
> >> +
> >> +			/* Only DDP frame will get attached */
> >> +			if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
> >> +				iov[1].iov_len =
> >> +					sizeof(struct iwarp_rdma_write);
> >> +			else
> >> +				iov[1].iov_len = sizeof(struct iwarp_send);
> >> +
> >> +			term->flag_m = 1;
> >> +			term->flag_d = 1;
> >> +		}
> >> +		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
> >> +
> >> +		break;
> >> +
> >> +	case TERM_ERROR_LAYER_DDP:
> >> +		/* Report error encountered while DDP processing.
> >> +		 * This can only happen as a result of inbound
> >> +		 * DDP processing
> >> +		 */
> >> +
> >> +		/* Do not report DDP hdr information if packet
> >> +		 * layout is unknown
> >> +		 */
> >> +		if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
> >> +		     (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
> >> +		    ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
> >> +		     (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
> >> +			break;
> >> +
> >> +		iov[1].iov_base = rx_hdr;
> >> +
> >> +		if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
> >> +			iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
> >> +		else
> >> +			iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
> >> +
> >> +		term->flag_m = 1;
> >> +		term->flag_d = 1;
> >> +
> >> +		break;
> >> +
> >> +	default:
> >> +		break;
> >> +
> >> +	}
> >> +	if (term->flag_m || term->flag_d || term->flag_r) {
> >> +		iov[2].iov_base = &crc;
> >> +		iov[2].iov_len = sizeof(crc);
> >> +		len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
> >> +		num_frags = 3;
> >> +	} else {
> >> +		iov[1].iov_base = &crc;
> >> +		iov[1].iov_len = sizeof(crc);
> >> +		len_terminate = sizeof(*term) + MPA_CRC_SIZE;
> >> +		num_frags = 2;
> >> +	}
> >> +
> >> +	/* Adjust DDP Segment Length parameter, if valid */
> >> +	if (term->flag_m) {
> >> +		u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
> >> +		enum rdma_opcode op = __rdmap_opcode(&rx_hdr->ctrl);
> >> +
> >> +		real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
> >> +		rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
> >> +	}
> >> +
> >> +	term->ctrl.mpa_len = cpu_to_be16(len_terminate -
> >> +					 (MPA_HDR_SIZE + MPA_CRC_SIZE));
> >> +	if (qp->tx_ctx.mpa_crc_hd) {
> >> +		crypto_shash_init(rx_ctx->mpa_crc_hd);
> >> +		if (siw_crc_array(rx_ctx->mpa_crc_hd, (u8 *)iov[0].iov_base,
> >> +				  iov[0].iov_len))
> >> +			goto out;
> >> +
> >> +		if (num_frags == 3) {
> >> +			if (siw_crc_array(rx_ctx->mpa_crc_hd,
> >> +					  (u8 *)iov[1].iov_base,
> >> +					  iov[1].iov_len))
> >> +				goto out;
> >> +		}
> >> +		crypto_shash_final(rx_ctx->mpa_crc_hd, (u8 *)&crc);
> >> +	}
> >> +
> >> +	rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
> >> +	siw_dbg_qp(qp,
> >> +		   "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
> >> +		   rv == len_terminate ? "success" : "failure",
> >> +		   __rdmap_term_layer(term), __rdmap_term_etype(term),
> >> +		   __rdmap_term_ecode(term), rv);
> >> +out:
> >> +	kfree(term);
> >> +	kfree(err_hdr);
> >> +}
> >> +
> >> +/*
> >> + * handle all attrs other than state
> >> + */
> >> +static void siw_qp_modify_nonstate(struct siw_qp *qp,
> >> +				   struct siw_qp_attrs *attrs,
> >> +				   enum siw_qp_attr_mask mask)
> >> +{
> >> +	if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
> >> +		if (attrs->flags & SIW_RDMA_BIND_ENABLED)
> >> +			qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
> >> +		else
> >> +			qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
> >> +
> >> +		if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
> >> +			qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
> >> +		else
> >> +			qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
> >> +
> >> +		if (attrs->flags & SIW_RDMA_READ_ENABLED)
> >> +			qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
> >> +		else
> >> +			qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
> >> +	}
> >> +}
> >> +
> >> +/*
> >> + * caller holds qp->state_lock
> >> + */
> >> +int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
> >> +		  enum siw_qp_attr_mask mask)
> >> +{
> >> +	int	drop_conn = 0, rv = 0;
> >> +
> >> +	if (!mask)
> >> +		return 0;
> >> +
> >> +	siw_dbg_qp(qp, "state: %s => %s\n",
> >> +		   siw_qp_state_to_string[qp->attrs.state],
> >> +		   siw_qp_state_to_string[attrs->state]);
> >> +
> >> +	if (mask != SIW_QP_ATTR_STATE)
> >> +		siw_qp_modify_nonstate(qp, attrs, mask);
> >> +
> >> +	if (!(mask & SIW_QP_ATTR_STATE))
> >> +		return 0;
> >> +
> >> +	switch (qp->attrs.state) {
> >> +
> >> +	case SIW_QP_STATE_IDLE:
> >> +	case SIW_QP_STATE_RTR:
> >> +
> >> +		switch (attrs->state) {
> >> +
> >> +		case SIW_QP_STATE_RTS:
> >> +
> >> +			if (attrs->flags & SIW_MPA_CRC) {
> >> +				rv = siw_qp_enable_crc(qp);
> >> +				if (rv)
> >> +					break;
> >> +			}
> >> +			if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
> >> +				siw_dbg_qp(qp, "no socket\n");
> >> +				rv = -EINVAL;
> >> +				break;
> >> +			}
> >> +			if (!(mask & SIW_QP_ATTR_MPA)) {
> >> +				siw_dbg_qp(qp, "no MPA\n");
> >> +				rv = -EINVAL;
> >> +				break;
> >> +			}
> >> +			siw_dbg_qp(qp, "enter rts, peer 0x%08x, loc 0x%08x\n",
> >> +				   qp->cep->llp.raddr.sin_addr.s_addr,
> >> +				   qp->cep->llp.laddr.sin_addr.s_addr);
> >> +			/*
> >> +			 * Initialize iWARP TX state
> >> +			 */
> >> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
> >> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
> >> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
> >> +
> >> +			/*
> >> +			 * Initialize iWARP RX state
> >> +			 */
> >> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
> >> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
> >> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
> >> +
> >> +			/*
> >> +			 * init IRD free queue, caller has already checked
> >> +			 * limits.
> >> +			 */
> >> +			rv = siw_qp_readq_init(qp, attrs->irq_size,
> >> +					       attrs->orq_size);
> >> +			if (rv)
> >> +				break;
> >> +
> >> +			qp->attrs.sk = attrs->sk;
> >> +			qp->attrs.state = SIW_QP_STATE_RTS;
> >> +
> >> +			break;
> >> +
> >> +		case SIW_QP_STATE_ERROR:
> >> +			siw_rq_flush(qp);
> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> >> +			if (qp->cep) {
> >> +				siw_cep_put(qp->cep);
> >> +				qp->cep = NULL;
> >> +			}
> >> +			break;
> >> +
> >> +		case SIW_QP_STATE_RTR:
> >> +			/* ignore */
> >> +			break;
> >> +
> >> +		default:
> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> >> +				   siw_qp_state_to_string[qp->attrs.state],
> >> +				   siw_qp_state_to_string[attrs->state]);
> >> +			break;
> >> +		}
> >> +		break;
> >> +
> >> +	case SIW_QP_STATE_RTS:
> >> +
> >> +		switch (attrs->state) {
> >> +
> >> +		case SIW_QP_STATE_CLOSING:
> >> +			/*
> >> +			 * Verbs: move to IDLE if SQ and ORQ are empty.
> >> +			 * Move to ERROR otherwise. But first of all we must
> >> +			 * close the connection. So we keep CLOSING or ERROR
> >> +			 * as a transient state, schedule connection drop work
> >> +			 * and wait for the socket state change upcall to
> >> +			 * come back closed.
> >> +			 */
> >> +			if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
> >> +				qp->attrs.state = SIW_QP_STATE_CLOSING;
> >> +			} else {
> >> +				qp->attrs.state = SIW_QP_STATE_ERROR;
> >> +				siw_sq_flush(qp);
> >> +			}
> >> +			siw_rq_flush(qp);
> >> +
> >> +			drop_conn = 1;
> >> +			break;
> >> +
> >> +		case SIW_QP_STATE_TERMINATE:
> >> +			qp->attrs.state = SIW_QP_STATE_TERMINATE;
> >> +
> >> +			siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
> >> +					   RDMAP_ETYPE_CATASTROPHIC,
> >> +					   RDMAP_ECODE_UNSPECIFIED, 1);
> >> +			drop_conn = 1;
> >> +
> >> +			break;
> >> +
> >> +		case SIW_QP_STATE_ERROR:
> >> +			/*
> >> +			 * This is an emergency close.
> >> +			 *
> >> +			 * Any in progress transmit operation will get
> >> +			 * cancelled.
> >> +			 * This will likely result in a protocol failure,
> >> +			 * if a TX operation is in transit. The caller
> >> +			 * could unconditional wait to give the current
> >> +			 * operation a chance to complete.
> >> +			 * Esp., how to handle the non-empty IRQ case?
> >> +			 * The peer was asking for data transfer at a valid
> >> +			 * point in time.
> >> +			 */
> >> +			siw_sq_flush(qp);
> >> +			siw_rq_flush(qp);
> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> >> +			drop_conn = 1;
> >> +
> >> +			break;
> >> +
> >> +		default:
> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> >> +				   siw_qp_state_to_string[qp->attrs.state],
> >> +				   siw_qp_state_to_string[attrs->state]);
> >> +			break;
> >> +		}
> >> +		break;
> >> +
> >> +	case SIW_QP_STATE_TERMINATE:
> >> +
> >> +		switch (attrs->state) {
> >> +
> >> +		case SIW_QP_STATE_ERROR:
> >> +			siw_rq_flush(qp);
> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> >> +
> >> +			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
> >> +				siw_sq_flush(qp);
> >> +
> >> +			break;
> >> +
> >> +		default:
> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> >> +				   siw_qp_state_to_string[qp->attrs.state],
> >> +				   siw_qp_state_to_string[attrs->state]);
> >> +		}
> >> +		break;
> >> +
> >> +	case SIW_QP_STATE_CLOSING:
> >> +
> >> +		switch (attrs->state) {
> >> +
> >> +		case SIW_QP_STATE_IDLE:
> >> +			WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
> >> +			qp->attrs.state = SIW_QP_STATE_IDLE;
> >> +
> >> +			break;
> >> +
> >> +		case SIW_QP_STATE_CLOSING:
> >> +			/*
> >> +			 * The LLP may already moved the QP to closing
> >> +			 * due to graceful peer close init
> >> +			 */
> >> +			break;
> >> +
> >> +		case SIW_QP_STATE_ERROR:
> >> +			/*
> >> +			 * QP was moved to CLOSING by LLP event
> >> +			 * not yet seen by user.
> >> +			 */
> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> >> +
> >> +			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
> >> +				siw_sq_flush(qp);
> >> +
> >> +			siw_rq_flush(qp);
> >> +
> >> +			break;
> >> +
> >> +		default:
> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> >> +				   siw_qp_state_to_string[qp->attrs.state],
> >> +				   siw_qp_state_to_string[attrs->state]);
> >> +
> >> +			return -ECONNABORTED;
> >> +		}
> >> +		break;
> >> +
> >> +	default:
> >> +		siw_dbg_qp(qp, " noop: state %s\n",
> >> +			   siw_qp_state_to_string[qp->attrs.state]);
> >> +		break;
> >> +	}
> >> +	if (drop_conn)
> >> +		siw_qp_cm_drop(qp, 0);
> >> +
> >> +	return rv;
> >> +}
> >> +
> >> +struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
> >> +{
> >> +	struct siw_qp *qp =  siw_qp_id2obj(to_siw_dev(base_dev), id);
> >> +
> >> +	if (qp) {
> >> +		/*
> >> +		 * siw_qp_id2obj() increments object reference count
> >> +		 */
> >> +		siw_qp_put(qp);
> >> +		siw_dbg_qp(qp, "got base QP");
> >> +
> >> +		return &qp->base_qp;
> >> +	}
> >> +	return (struct ib_qp *)NULL;
> >> +}
> >> +
> >> +/*
> >> + * siw_check_mem()
> >> + *
> >> + * Check protection domain, STAG state, access permissions and
> >> + * address range for memory object.
> >> + *
> >> + * @pd:		Protection Domain memory should belong to
> >> + * @mem:	memory to be checked
> >> + * @addr:	starting addr of mem
> >> + * @perms:	requested access permissions
> >> + * @len:	len of memory interval to be checked
> >> + *
> >> + */
> >> +int siw_check_mem(struct siw_pd *pd, struct siw_mem *mem, u64
> >addr,
> >> +		  enum siw_access_flags perms, int len)
> >> +{
> >> +	if (siw_mem2mr(mem)->pd != pd) {
> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: pd mismatch\n", OBJ_ID(pd));
> >> +		return -E_PD_MISMATCH;
> >> +	}
> >> +	if (!mem->stag_valid) {
> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: stag 0x%08x invalid\n",
> >> +			OBJ_ID(pd), OBJ_ID(mem));
> >> +		return -E_STAG_INVALID;
> >> +	}
> >> +	/*
> >> +	 * check access permissions
> >> +	 */
> >> +	if ((mem->perms & perms) < perms) {
> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: permissions 0x%08x < 0x%08x\n",
> >> +			OBJ_ID(pd), mem->perms, perms);
> >> +		return -E_ACCESS_PERM;
> >> +	}
> >> +	/*
> >> +	 * Check address interval: we relax check to allow memory
> >shrinked
> >> +	 * from the start address _after_ placing or fetching len bytes.
> >> +	 * TODO: this relaxation is probably overdone
> >> +	 */
> >> +	if (addr < mem->va || addr + len > mem->va + mem->len) {
> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: MEM interval len %d\n",
> >> +			OBJ_ID(pd), len);
> >> +		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx) out of bounds\n",
> >> +			(unsigned long long)addr,
> >> +			(unsigned long long)(addr + len));
> >> +		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx] LKey=0x%08x\n",
> >> +			(unsigned long long)mem->va,
> >> +			(unsigned long long)(mem->va + mem->len),
> >> +			OBJ_ID(mem));
> >> +
> >> +		return -E_BASE_BOUNDS;
> >> +	}
> >> +	return E_ACCESS_OK;
> >> +}
> >> +
> >> +/*
> >> + * siw_check_sge()
> >> + *
> >> + * Check SGE for access rights in given interval
> >> + *
> >> + * @pd:		Protection Domain memory should belong to
> >> + * @sge:	SGE to be checked
> >> + * @mem:	array of memory references
> >> + * @perms:	requested access permissions
> >> + * @off:	starting offset in SGE
> >> + * @len:	len of memory interval to be checked
> >> + *
> >> + * NOTE: Function references SGE's memory object (mem->obj)
> >> + * if not yet done. New reference is kept if check went ok and
> >> + * released if check failed. If mem->obj is already valid, no new
> >> + * lookup is being done and mem is not released it check fails.
> >> + */
> >> +int
> >> +siw_check_sge(struct siw_pd *pd, struct siw_sge *sge,
> >> +	      struct siw_mem *mem[], enum siw_access_flags perms,
> >> +	      u32 off, int len)
> >> +{
> >> +	struct siw_device *sdev = pd->hdr.sdev;
> >> +	int new_ref = 0, rv = E_ACCESS_OK;
> >> +
> >> +	if (len + off > sge->length) {
> >> +		rv = -E_BASE_BOUNDS;
> >> +		goto fail;
> >> +	}
> >> +	if (*mem == NULL) {
> >> +		*mem = siw_mem_id2obj(sdev, sge->lkey >> 8);
> >> +		if (*mem == NULL) {
> >> +			rv = -E_STAG_INVALID;
> >> +			goto fail;
> >> +		}
> >> +		new_ref = 1;
> >> +	}
> >> +
> >> +	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
> >> +	if (rv)
> >> +		goto fail;
> >> +
> >> +	return 0;
> >> +
> >> +fail:
> >> +	if (new_ref) {
> >> +		siw_mem_put(*mem);
> >> +		*mem = NULL;
> >> +	}
> >> +	return rv;
> >> +}
> >> +
> >> +void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
> >> +{
> >> +	rreq->id = sqe->id;
> >> +	rreq->opcode = sqe->opcode;
> >> +	rreq->sge[0].laddr = sqe->sge[0].laddr;
> >> +	rreq->sge[0].length = sqe->sge[0].length;
> >> +	rreq->sge[0].lkey = sqe->sge[0].lkey;
> >> +	rreq->sge[1].lkey = sqe->sge[1].lkey;
> >> +	rreq->flags = sqe->flags | SIW_WQE_VALID;
> >> +	rreq->num_sge = 1;
> >> +}
> >> +
> >> +/*
> >> + * Must be called with SQ locked.
> >> + * To avoid complete SQ starvation by constant inbound READ
> >requests,
> >> + * the active IRQ will not be served after qp->irq_burst, if the
> >> + * SQ has pending work.
> >> + */
> >> +int siw_activate_tx(struct siw_qp *qp)
> >> +{
> >> +	struct siw_sqe	*irqe, *sqe;
> >> +	struct siw_wqe	*wqe = tx_wqe(qp);
> >> +	int rv = 1;
> >> +
> >> +	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
> >> +
> >> +	if (irqe->flags & SIW_WQE_VALID) {
> >> +		sqe = sq_get_next(qp);
> >> +
> >> +		/*
> >> +		 * Avoid local WQE processing starvation in case
> >> +		 * of constant inbound READ request stream
> >> +		 */
> >> +		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
> >> +			qp->irq_burst = 0;
> >> +			goto skip_irq;
> >> +		}
> >> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
> >> +		wqe->wr_status = SIW_WR_QUEUED;
> >> +
> >> +		/* start READ RESPONSE */
> >> +		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
> >> +		wqe->sqe.flags = 0;
> >> +		if (irqe->num_sge) {
> >> +			wqe->sqe.num_sge = 1;
> >> +			wqe->sqe.sge[0].length = irqe->sge[0].length;
> >> +			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
> >> +			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
> >> +		} else {
> >> +			wqe->sqe.num_sge = 0;
> >> +		}
> >> +
> >> +		/* Retain original RREQ's message sequence number for
> >> +		 * potential error reporting cases.
> >> +		 */
> >> +		wqe->sqe.sge[1].length = irqe->sge[1].length;
> >> +
> >> +		wqe->sqe.rkey = irqe->rkey;
> >> +		wqe->sqe.raddr = irqe->raddr;
> >> +
> >> +		wqe->processed = 0;
> >> +		qp->irq_get++;
> >> +
> >> +		/* mark current IRQ entry free */
> >> +		smp_store_mb(irqe->flags, 0);
> >> +
> >> +		goto out;
> >> +	}
> >> +
> >> +	sqe = sq_get_next(qp);
> >> +	if (sqe) {
> >> +skip_irq:
> >> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
> >> +		wqe->wr_status = SIW_WR_QUEUED;
> >> +
> >> +		/* First copy SQE to kernel private memory */
> >> +		memcpy(&wqe->sqe, sqe, sizeof(*sqe));
> >> +
> >> +		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
> >> +			rv = -EINVAL;
> >> +			goto out;
> >> +		}
> >> +		if (wqe->sqe.flags & SIW_WQE_INLINE) {
> >> +			if (wqe->sqe.opcode != SIW_OP_SEND &&
> >> +			    wqe->sqe.opcode != SIW_OP_WRITE) {
> >> +				rv = -EINVAL;
> >> +				goto out;
> >> +			}
> >> +			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
> >> +				rv = -EINVAL;
> >> +				goto out;
> >> +			}
> >> +			wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1];
> >> +			wqe->sqe.sge[0].lkey = 0;
> >> +			wqe->sqe.num_sge = 1;
> >> +		}
> >> +		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
> >> +			/* A READ cannot be fenced */
> >> +			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
> >> +			    wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV)) {
> >> +				siw_dbg_qp(qp, "cannot fence read\n");
> >> +				rv = -EINVAL;
> >> +				goto out;
> >> +			}
> >> +			spin_lock(&qp->orq_lock);
> >> +
> >> +			if (!siw_orq_empty(qp)) {
> >> +				qp->tx_ctx.orq_fence = 1;
> >> +				rv = 0;
> >> +			}
> >> +			spin_unlock(&qp->orq_lock);
> >> +
> >> +		} else if (wqe->sqe.opcode == SIW_OP_READ ||
> >> +			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
> >> +			struct siw_sqe	*rreq;
> >> +
> >> +			wqe->sqe.num_sge = 1;
> >> +
> >> +			spin_lock(&qp->orq_lock);
> >> +
> >> +			rreq = orq_get_free(qp);
> >> +			if (rreq) {
> >> +				/*
> >> +				 * Make an immediate copy in ORQ to be ready
> >> +				 * to process loopback READ reply
> >> +				 */
> >> +				siw_read_to_orq(rreq, &wqe->sqe);
> >> +				qp->orq_put++;
> >> +			} else {
> >> +				qp->tx_ctx.orq_fence = 1;
> >> +				rv = 0;
> >> +			}
> >> +			spin_unlock(&qp->orq_lock);
> >> +		}
> >> +
> >> +		/* Clear SQE, can be re-used by application */
> >> +		smp_store_mb(sqe->flags, 0);
> >> +		qp->sq_get++;
> >> +	} else {
> >> +		rv = 0;
> >> +	}
> >> +out:
> >> +	if (unlikely(rv < 0)) {
> >> +		siw_dbg_qp(qp, "error %d\n", rv);
> >> +		wqe->wr_status = SIW_WR_IDLE;
> >> +	}
> >> +	return rv;
> >> +}
> >> +
> >> +/*
> >> + * Check if current CQ state qualifies for
> >> + * calling CQ completion handler. Must be
> >> + * called with CQ lock held.
> >> + */
> >> +static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
> >> +{
> >> +	u64 cq_notify;
> >> +
> >> +	if (!cq->base_cq.comp_handler)
> >> +		return false;
> >> +
> >> +	cq_notify = READ_ONCE(*cq->notify);
> >> +
> >> +	if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
> >> +	    ((cq_notify & SIW_NOTIFY_SOLICITED) &&
> >> +	     (flags & SIW_WQE_SOLICITED))) {
> >> +		/* dis-arm CQ */
> >> +		smp_store_mb(*cq->notify, SIW_NOTIFY_NOT);
> >> +
> >> +		return true;
> >> +	}
> >> +	return false;
> >> +}
> >> +
> >> +/* Must be called without holding CQ lock */
> >> +static inline void siw_cq_completion(struct siw_cq *cq)
> >> +{
> >> +	siw_dbg_obj(cq, "Completion\n");
> >> +	(*cq->base_cq.comp_handler)(&cq->base_cq, cq->base_cq.cq_context);
> >> +}
> >> +
> >> +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
> >> +		     enum siw_wc_status status)
> >> +{
> >> +	struct siw_cq *cq = qp->scq;
> >> +	int rv = 0;
> >> +
> >> +	if (cq) {
> >> +		u32 sqe_flags = sqe->flags;
> >> +		struct siw_cqe *cqe;
> >> +		u32 idx;
> >> +		unsigned long flags;
> >> +
> >> +		spin_lock_irqsave(&cq->lock, flags);
> >> +
> >> +		idx = cq->cq_put % cq->num_cqe;
> >> +		cqe = &cq->queue[idx];
> >> +
> >> +		if (!READ_ONCE(cqe->flags)) {
> >> +			bool notify;
> >> +
> >> +			cqe->id = sqe->id;
> >> +			cqe->opcode = sqe->opcode;
> >> +			cqe->status = status;
> >> +			cqe->imm_data = 0;
> >> +			cqe->bytes = bytes;
> >> +
> >> +			if (cq->kernel_verbs) {
> >
> >kernel_verbs is managed by ib_core, why should the driver know about it?
> >
> User-land CQEs carry the corresponding QP ID, while kernel clients expect
> a QP pointer here. That's where the difference comes from. This distinction
> between kernel and user clients is needed in more places, e.g. the
> user-land CQE array is memory mapped, whereas the kernel one is not.

It is passed through udata; your code should check for the existence of
"udata" and not manage a private user/kernel flag.
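
Something along these lines, i.e. derive it from the udata pointer the
core hands you at creation time (just a sketch, not compile tested;
siw_cq_init() and siw_cq_map_user() are made-up names here, and the exact
create_cq plumbing depends on the rdma core revision you rebase on):

	static int siw_cq_init(struct siw_cq *cq, int num_cqe,
			       struct ib_udata *udata)
	{
		cq->num_cqe = num_cqe;

		/* CQ created through uverbs: the CQE array must be
		 * mmap()able for user space. A kernel consumer gets
		 * plain kernel memory instead.
		 */
		if (udata)
			return siw_cq_map_user(cq, udata);

		cq->queue = vzalloc(num_cqe * sizeof(struct siw_cqe));

		return cq->queue ? 0 : -ENOMEM;
	}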

Thanks

>
>
>
Bernard Metzler Feb. 27, 2019, 2:59 p.m. UTC | #4
-----"Leon Romanovsky" <leon@kernel.org> wrote: -----

>To: "Bernard Metzler" <BMT@zurich.ibm.com>
>From: "Leon Romanovsky" <leon@kernel.org>
>Date: 02/27/2019 01:49PM
>Cc: linux-rdma@vger.kernel.org
>Subject: Re: [PATCH v5 08/13] SIW queue pair methods
>
>On Wed, Feb 27, 2019 at 11:54:07AM +0000, Bernard Metzler wrote:
>> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
>>
>> >To: "Bernard Metzler" <bmt@zurich.ibm.com>
>> >From: "Leon Romanovsky" <leon@kernel.org>
>> >Date: 02/24/2019 02:19PM
>> >Cc: linux-rdma@vger.kernel.org
>> >Subject: Re: [PATCH v5 08/13] SIW queue pair methods
>> >
>> >On Tue, Feb 19, 2019 at 11:08:58AM +0100, Bernard Metzler wrote:
> >> >> +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
>> >> +		     enum siw_wc_status status)
>> >> +{
>> >> +	struct siw_cq *cq = qp->scq;
>> >> +	int rv = 0;
>> >> +
>> >> +	if (cq) {
>> >> +		u32 sqe_flags = sqe->flags;
>> >> +		struct siw_cqe *cqe;
>> >> +		u32 idx;
>> >> +		unsigned long flags;
>> >> +
>> >> +		spin_lock_irqsave(&cq->lock, flags);
>> >> +
>> >> +		idx = cq->cq_put % cq->num_cqe;
>> >> +		cqe = &cq->queue[idx];
>> >> +
>> >> +		if (!READ_ONCE(cqe->flags)) {
>> >> +			bool notify;
>> >> +
>> >> +			cqe->id = sqe->id;
>> >> +			cqe->opcode = sqe->opcode;
>> >> +			cqe->status = status;
>> >> +			cqe->imm_data = 0;
>> >> +			cqe->bytes = bytes;
>> >> +
>> >> +			if (cq->kernel_verbs) {
>> >
> >> >kernel_verbs is managed by ib_core, why should the driver know about it?
>> >
> >> User-land CQEs carry the corresponding QP ID, while kernel clients
> >> expect a QP pointer here. That's where the difference comes from. This
> >> distinction between kernel and user clients is needed in more places,
> >> e.g. the user-land CQE array is memory mapped, whereas the kernel one
> >> is not.
>
> >It is passed through udata; your code should check for the existence of
> >"udata" and not manage a private user/kernel flag.
>
>

Right. I simply wanted to avoid the potential cache thrashing - the
udata pointer of the ib_device sits quite far away from the CQ array.
So we trade a bit of redundancy for performance, and CQE creation is
definitely on the fast path...
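
To illustrate, with the user/kernel property captured once at CQ creation,
the completion fast path stays a single test on a flag that sits next to
the CQE ring anyway (sketch only - the CQE union members and the qp_id()
helper are guessed names here, not the actual definitions):

	/* in siw_sqe_complete(), with the CQ lock held */
	if (cq->kernel_verbs)
		/* kernel client expects the QP pointer in its CQE */
		cqe->base_qp = &qp->base_qp;
	else
		/* user space resolves the completion by QP id */
		cqe->qp_id = qp_id(qp);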

Thanks
Bernard.
Leon Romanovsky Feb. 27, 2019, 4:01 p.m. UTC | #5
On Wed, Feb 27, 2019 at 02:59:26PM +0000, Bernard Metzler wrote:
> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
>
> >To: "Bernard Metzler" <BMT@zurich.ibm.com>
> >From: "Leon Romanovsky" <leon@kernel.org>
> >Date: 02/27/2019 01:49PM
> >Cc: linux-rdma@vger.kernel.org
> >Subject: Re: [PATCH v5 08/13] SIW queue pair methods
> >
> >On Wed, Feb 27, 2019 at 11:54:07AM +0000, Bernard Metzler wrote:
> >> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
> >>
> >> >To: "Bernard Metzler" <bmt@zurich.ibm.com>
> >> >From: "Leon Romanovsky" <leon@kernel.org>
> >> >Date: 02/24/2019 02:19PM
> >> >Cc: linux-rdma@vger.kernel.org
> >> >Subject: Re: [PATCH v5 08/13] SIW queue pair methods
> >> >
> >> >On Tue, Feb 19, 2019 at 11:08:58AM +0100, Bernard Metzler wrote:
> >> >> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
> >> >> ---
> >> >>  drivers/infiniband/sw/siw/siw_qp.c | 1478
> >> >++++++++++++++++++++++++++++
> >> >>  1 file changed, 1478 insertions(+)
> >> >>  create mode 100644 drivers/infiniband/sw/siw/siw_qp.c
> >> >>
> >> >> diff --git a/drivers/infiniband/sw/siw/siw_qp.c
> >> >b/drivers/infiniband/sw/siw/siw_qp.c
> >> >> new file mode 100644
> >> >> index 000000000000..75fd151dae39
> >> >> --- /dev/null
> >> >> +++ b/drivers/infiniband/sw/siw/siw_qp.c
> >> >> @@ -0,0 +1,1478 @@
> >> >> +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
> >> >> +/*
> >> >> + * Software iWARP device driver
> >> >> + *
> >> >> + * Authors: Bernard Metzler <bmt@zurich.ibm.com>
> >> >> + *
> >> >> + * Copyright (c) 2008-2018, IBM Corporation
> >> >> + *
> >> >> + * This software is available to you under a choice of one of
> >two
> >> >> + * licenses. You may choose to be licensed under the terms of
> >the
> >> >GNU
> >> >> + * General Public License (GPL) Version 2, available from the
> >file
> >> >> + * COPYING in the main directory of this source tree, or the
> >> >> + * BSD license below:
> >> >> + *
> >> >> + *   Redistribution and use in source and binary forms, with or
> >> >> + *   without modification, are permitted provided that the
> >> >following
> >> >> + *   conditions are met:
> >> >> + *
> >> >> + *   - Redistributions of source code must retain the above
> >> >copyright notice,
> >> >> + *     this list of conditions and the following disclaimer.
> >> >> + *
> >> >> + *   - Redistributions in binary form must reproduce the above
> >> >copyright
> >> >> + *     notice, this list of conditions and the following
> >> >disclaimer in the
> >> >> + *     documentation and/or other materials provided with the
> >> >distribution.
> >> >> + *
> >> >> + *   - Neither the name of IBM nor the names of its
> >contributors
> >> >may be
> >> >> + *     used to endorse or promote products derived from this
> >> >software without
> >> >> + *     specific prior written permission.
> >> >> + *
> >> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
> >KIND,
> >> >> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
> >WARRANTIES
> >> >OF
> >> >> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> >> >> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
> >> >HOLDERS
> >> >> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
> >IN
> >> >AN
> >> >> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
> >OR
> >> >IN
> >> >> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> >> >THE
> >> >> + * SOFTWARE.
> >> >> + */
> >> >> +
> >> >> +#include <linux/errno.h>
> >> >> +#include <linux/types.h>
> >> >> +#include <linux/net.h>
> >> >> +#include <linux/file.h>
> >> >> +#include <linux/scatterlist.h>
> >> >> +#include <linux/highmem.h>
> >> >> +#include <linux/vmalloc.h>
> >> >> +#include <asm/barrier.h>
> >> >> +#include <net/sock.h>
> >> >> +#include <net/tcp_states.h>
> >> >> +#include <net/tcp.h>
> >> >> +
> >> >> +#include <rdma/iw_cm.h>
> >> >> +#include <rdma/ib_verbs.h>
> >> >> +#include <rdma/ib_smi.h>
> >> >> +#include <rdma/ib_user_verbs.h>
> >> >> +
> >> >> +#include "siw.h"
> >> >> +#include "siw_obj.h"
> >> >> +#include "siw_cm.h"
> >> >> +
> >> >> +static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof
> >> >"TERMINATE"] = {
> >> >> +	[SIW_QP_STATE_IDLE]		= "IDLE",
> >> >> +	[SIW_QP_STATE_RTR]		= "RTR",
> >> >> +	[SIW_QP_STATE_RTS]		= "RTS",
> >> >> +	[SIW_QP_STATE_CLOSING]		= "CLOSING",
> >> >> +	[SIW_QP_STATE_TERMINATE]	= "TERMINATE",
> >> >> +	[SIW_QP_STATE_ERROR]		= "ERROR"
> >> >> +};
> >> >> +
> >> >> +/*
> >> >> + * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp
> >> >settings on a
> >> >> + * per-RDMAP message basis. Please keep order of initializer.
> >All
> >> >MPA len
> >> >> + * is initialized to minimum packet size.
> >> >> + */
> >> >> +struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = { {
> >> >> +	/* RDMAP_RDMA_WRITE */
> >> >> +	.hdr_len = sizeof(struct iwarp_rdma_write),
> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST
> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> >> +		| cpu_to_be16(RDMAP_RDMA_WRITE),
> >> >> +	.proc_data = siw_proc_write
> >> >> +},
> >> >> +{	/* RDMAP_RDMA_READ_REQ */
> >> >> +	.hdr_len = sizeof(struct iwarp_rdma_rreq),
> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> >> +		| cpu_to_be16(RDMAP_RDMA_READ_REQ),
> >> >> +	.proc_data = siw_proc_rreq
> >> >> +},
> >> >> +{	/* RDMAP_RDMA_READ_RESP */
> >> >> +	.hdr_len = sizeof(struct iwarp_rdma_rresp),
> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST
> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> >> +		| cpu_to_be16(RDMAP_RDMA_READ_RESP),
> >> >> +	.proc_data = siw_proc_rresp
> >> >> +},
> >> >> +{	/* RDMAP_SEND */
> >> >> +	.hdr_len = sizeof(struct iwarp_send),
> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> >> +		| cpu_to_be16(RDMAP_SEND),
> >> >> +	.proc_data = siw_proc_send
> >> >> +},
> >> >> +{	/* RDMAP_SEND_INVAL */
> >> >> +	.hdr_len = sizeof(struct iwarp_send_inv),
> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> >> +		| cpu_to_be16(RDMAP_SEND_INVAL),
> >> >> +	.proc_data = siw_proc_send
> >> >> +},
> >> >> +{	/* RDMAP_SEND_SE */
> >> >> +	.hdr_len = sizeof(struct iwarp_send),
> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> >> +		| cpu_to_be16(RDMAP_SEND_SE),
> >> >> +	.proc_data = siw_proc_send
> >> >> +},
> >> >> +{	/* RDMAP_SEND_SE_INVAL */
> >> >> +	.hdr_len = sizeof(struct iwarp_send_inv),
> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> >> +		| cpu_to_be16(RDMAP_SEND_SE_INVAL),
> >> >> +	.proc_data = siw_proc_send
> >> >> +},
> >> >> +{	/* RDMAP_TERMINATE */
> >> >> +	.hdr_len = sizeof(struct iwarp_terminate),
> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
> >> >> +		| cpu_to_be16(RDMAP_TERMINATE),
> >> >> +	.proc_data = siw_proc_terminate
> >> >> +} };
> >> >> +
> >> >> +void siw_qp_llp_data_ready(struct sock *sk)
> >> >> +{
> >> >> +	struct siw_qp		*qp;
> >> >> +
> >> >> +	read_lock(&sk->sk_callback_lock);
> >> >> +
> >> >> +	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
> >> >> +		goto done;
> >> >> +
> >> >> +	qp = sk_to_qp(sk);
> >> >> +
> >> >> +	if (likely(!qp->rx_ctx.rx_suspend &&
> >> >> +		   down_read_trylock(&qp->state_lock))) {
> >> >> +		read_descriptor_t rd_desc = {.arg.data = qp, .count = 1};
> >> >> +
> >> >> +		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
> >> >> +			/*
> >> >> +			 * Implements data receive operation during
> >> >> +			 * socket callback. TCP gracefully catches
> >> >> +			 * the case where there is nothing to receive
> >> >> +			 * (not calling siw_tcp_rx_data() then).
> >> >> +			 */
> >> >> +			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
> >> >> +
> >> >> +		up_read(&qp->state_lock);
> >> >> +	} else {
> >> >> +		siw_dbg_qp(qp, "unable to rx, suspend: %d\n",
> >> >> +			   qp->rx_ctx.rx_suspend);
> >> >> +	}
> >> >> +done:
> >> >> +	read_unlock(&sk->sk_callback_lock);
> >> >> +}
> >> >> +
> >> >> +void siw_qp_llp_close(struct siw_qp *qp)
> >> >> +{
> >> >> +	siw_dbg_qp(qp, "enter llp close, state = %s\n",
> >> >> +		   siw_qp_state_to_string[qp->attrs.state]);
> >> >> +
> >> >> +	down_write(&qp->state_lock);
> >> >> +
> >> >> +	qp->rx_ctx.rx_suspend = 1;
> >> >> +	qp->tx_ctx.tx_suspend = 1;
> >> >> +	qp->attrs.sk = NULL;
> >> >> +
> >> >> +	switch (qp->attrs.state) {
> >> >> +
> >> >> +	case SIW_QP_STATE_RTS:
> >> >> +	case SIW_QP_STATE_RTR:
> >> >> +	case SIW_QP_STATE_IDLE:
> >> >> +	case SIW_QP_STATE_TERMINATE:
> >> >> +
> >> >> +		qp->attrs.state = SIW_QP_STATE_ERROR;
> >> >> +
> >> >> +		break;
> >> >> +	/*
> >> >> +	 * SIW_QP_STATE_CLOSING:
> >> >> +	 *
> >> >> +	 * This is a forced close. shall the QP be moved to
> >> >> +	 * ERROR or IDLE ?
> >> >> +	 */
> >> >> +	case SIW_QP_STATE_CLOSING:
> >> >> +		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
> >> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> >> >> +		else
> >> >> +			qp->attrs.state = SIW_QP_STATE_IDLE;
> >> >> +
> >> >> +		break;
> >> >> +
> >> >> +	default:
> >> >> +		siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
> >> >> +			   siw_qp_state_to_string[qp->attrs.state]);
> >> >> +		break;
> >> >> +	}
> >> >> +	siw_sq_flush(qp);
> >> >> +	siw_rq_flush(qp);
> >> >> +
> >> >> +	/*
> >> >> +	 * Dereference closing CEP
> >> >> +	 */
> >> >> +	if (qp->cep) {
> >> >> +		siw_cep_put(qp->cep);
> >> >> +		qp->cep = NULL;
> >> >> +	}
> >> >> +
> >> >> +	up_write(&qp->state_lock);
> >> >> +
> >> >> +	siw_dbg_qp(qp, "llp close exit: state %s\n",
> >> >> +		   siw_qp_state_to_string[qp->attrs.state]);
> >> >> +}
> >> >> +
> >> >> +/*
> >> >> + * socket callback routine informing about newly available send
> >> >space.
> >> >> + * Function schedules SQ work for processing SQ items.
> >> >> + */
> >> >> +void siw_qp_llp_write_space(struct sock *sk)
> >> >> +{
> >> >> +	struct siw_cep	*cep = sk_to_cep(sk);
> >> >> +
> >> >> +	cep->sk_write_space(sk);
> >> >> +
> >> >> +	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
> >> >> +		(void) siw_sq_start(cep->qp);
> >> >> +}
> >> >> +
> >> >> +static int siw_qp_readq_init(struct siw_qp *qp, int irq_size,
> >int
> >> >orq_size)
> >> >> +{
> >> >> +	if (!irq_size)
> >> >> +		irq_size = 1;
> >> >> +	if (!orq_size)
> >> >> +		orq_size = 1;
> >> >> +
> >> >> +	qp->attrs.irq_size = irq_size;
> >> >> +	qp->attrs.orq_size = orq_size;
> >> >> +
> >> >> +	qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
> >> >> +	if (!qp->irq) {
> >> >> +		siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
> >> >> +		qp->attrs.irq_size = 0;
> >> >> +		return -ENOMEM;
> >> >> +	}
> >> >> +	qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
> >> >> +	if (!qp->orq) {
> >> >> +		siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
> >> >> +		qp->attrs.orq_size = 0;
> >> >> +		qp->attrs.irq_size = 0;
> >> >> +		vfree(qp->irq);
> >> >> +		return -ENOMEM;
> >> >> +	}
> >> >> +	return 0;
> >> >> +}
> >> >> +
> >> >> +static int siw_qp_enable_crc(struct siw_qp *qp)
> >> >> +{
> >> >> +	struct siw_iwarp_rx *c_rx = &qp->rx_ctx;
> >> >> +	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
> >> >> +	int rv = 0;
> >> >> +
> >> >> +	if (siw_crypto_shash == NULL) {
> >> >> +		rv = -ENOENT;
> >> >> +		goto error;
> >> >> +	}
> >> >> +	c_tx->mpa_crc_hd = kzalloc(sizeof(struct shash_desc) +
> >> >> +				   crypto_shash_descsize(siw_crypto_shash),
> >> >> +				   GFP_KERNEL);
> >> >> +	c_rx->mpa_crc_hd = kzalloc(sizeof(struct shash_desc) +
> >> >> +				   crypto_shash_descsize(siw_crypto_shash),
> >> >> +				   GFP_KERNEL);
> >> >> +	if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
> >> >> +		rv = -ENOMEM;
> >> >> +		goto error;
> >> >> +	}
> >> >> +	c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
> >> >> +	c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
> >> >> +
> >> >> +	return 0;
> >> >> +error:
> >> >> +	siw_dbg_qp(qp, "failed loading crc32c. error %d\n", rv);
> >> >> +
> >> >> +	kfree(c_tx->mpa_crc_hd);
> >> >> +	kfree(c_rx->mpa_crc_hd);
> >> >> +
> >> >> +	c_tx->mpa_crc_hd = c_rx->mpa_crc_hd = NULL;
> >> >> +
> >> >> +	return rv;
> >> >> +}
> >> >> +
> >> >> +/*
> >> >> + * Send a non signalled READ or WRITE to peer side as negotiated
> >> >> + * with MPAv2 P2P setup protocol. The work request is only created
> >> >> + * as a current active WR and does not consume Send Queue space.
> >> >> + *
> >> >> + * Caller must hold QP state lock.
> >> >> + */
> >> >> +int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
> >> >> +{
> >> >> +	struct siw_wqe	*wqe = tx_wqe(qp);
> >> >> +	unsigned long flags;
> >> >> +	int rv = 0;
> >> >> +
> >> >> +	spin_lock_irqsave(&qp->sq_lock, flags);
> >> >> +
> >> >> +	if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
> >> >> +		spin_unlock_irqrestore(&qp->sq_lock, flags);
> >> >> +		return -EIO;
> >> >> +	}
> >> >> +	memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
> >> >> +
> >> >> +	wqe->wr_status = SIW_WR_QUEUED;
> >> >> +	wqe->sqe.flags = 0;
> >> >> +	wqe->sqe.num_sge = 1;
> >> >> +	wqe->sqe.sge[0].length = 0;
> >> >> +	wqe->sqe.sge[0].laddr = 0;
> >> >> +	wqe->sqe.sge[0].lkey = 0;
> >> >> +	/*
> >> >> +	 * While it must not be checked for inbound zero length
> >> >> +	 * READ/WRITE, some HW may treat STag 0 special.
> >> >> +	 */
> >> >> +	wqe->sqe.rkey = 1;
> >> >> +	wqe->sqe.raddr = 0;
> >> >> +	wqe->processed = 0;
> >> >> +
> >> >> +	if (ctrl & MPA_V2_RDMA_WRITE_RTR)
> >> >> +		wqe->sqe.opcode = SIW_OP_WRITE;
> >> >> +	else if (ctrl & MPA_V2_RDMA_READ_RTR) {
> >> >> +		struct siw_sqe	*rreq;
> >> >> +
> >> >> +		wqe->sqe.opcode = SIW_OP_READ;
> >> >> +
> >> >> +		spin_lock(&qp->orq_lock);
> >> >> +
> >> >> +		rreq = orq_get_free(qp);
> >> >> +		if (rreq) {
> >> >> +			siw_read_to_orq(rreq, &wqe->sqe);
> >> >> +			qp->orq_put++;
> >> >> +		} else
> >> >> +			rv = -EIO;
> >> >> +
> >> >> +		spin_unlock(&qp->orq_lock);
> >> >> +	} else
> >> >> +		rv = -EINVAL;
> >> >> +
> >> >> +	if (rv)
> >> >> +		wqe->wr_status = SIW_WR_IDLE;
> >> >> +
> >> >> +	spin_unlock_irqrestore(&qp->sq_lock, flags);
> >> >> +
> >> >> +	if (!rv)
> >> >> +		rv = siw_sq_start(qp);
> >> >> +
> >> >> +	return rv;
> >> >> +}
> >> >> +
> >> >> +/*
> >> >> + * Map memory access error to DDP tagged error
> >> >> + */
> >> >> +enum ddp_ecode siw_tagged_error(enum siw_access_state state)
> >> >> +{
> >> >> +	if (state == E_STAG_INVALID)
> >> >> +		return DDP_ECODE_T_INVALID_STAG;
> >> >> +	if (state == E_BASE_BOUNDS)
> >> >> +		return DDP_ECODE_T_BASE_BOUNDS;
> >> >> +	if (state == E_PD_MISMATCH)
> >> >> +		return DDP_ECODE_T_STAG_NOT_ASSOC;
> >> >> +	if (state == E_ACCESS_PERM)
> >> >> +		/*
> >> >> +		 * RFC 5041 (DDP) lacks an ecode for insufficient access
> >> >> +		 * permissions. 'Invalid STag' seems to be the closest
> >> >> +		 * match though.
> >> >> +		 */
> >> >> +		return DDP_ECODE_T_INVALID_STAG;
> >> >> +
> >> >> +	WARN_ON(1);
> >> >> +
> >> >> +	return DDP_ECODE_T_INVALID_STAG;
> >> >> +}
> >> >> +
> >> >> +/*
> >> >> + * Map memory access error to RDMAP protection error
> >> >> + */
> >> >> +enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
> >> >> +{
> >> >> +	if (state == E_STAG_INVALID)
> >> >> +		return RDMAP_ECODE_INVALID_STAG;
> >> >> +	if (state == E_BASE_BOUNDS)
> >> >> +		return RDMAP_ECODE_BASE_BOUNDS;
> >> >> +	if (state == E_PD_MISMATCH)
> >> >> +		return RDMAP_ECODE_STAG_NOT_ASSOC;
> >> >> +	if (state == E_ACCESS_PERM)
> >> >> +		return RDMAP_ECODE_ACCESS_RIGHTS;
> >> >> +
> >> >> +	return RDMAP_ECODE_UNSPECIFIED;
> >> >> +}
> >> >> +
> >> >> +void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer,
> >> >> +			u8 etype, u8 ecode, int in_tx)
> >> >> +{
> >> >> +	if (!qp->term_info.valid) {
> >> >> +		memset(&qp->term_info, 0, sizeof(qp->term_info));
> >> >> +		qp->term_info.layer = layer;
> >> >> +		qp->term_info.etype = etype;
> >> >> +		qp->term_info.ecode = ecode;
> >> >> +		qp->term_info.in_tx = in_tx;
> >> >> +		qp->term_info.valid = 1;
> >> >> +	}
> >> >> +	siw_dbg_qp(qp,
> >> >> +		   "init TERM: layer %d, type %d, code %d, in tx %s\n",
> >> >> +		   layer, etype, ecode, in_tx ? "yes" : "no");
> >> >> +}
> >> >> +
> >> >> +/*
> >> >> + * Send a TERMINATE message, as defined in RFCs 5040/5041/5044/6581.
> >> >> + * Sending TERMINATE messages is best effort - such messages
> >> >> + * can only be sent if the QP is still connected and it does
> >> >> + * not have another outbound message in-progress, i.e. the
> >> >> + * TERMINATE message must not interfere with an incomplete current
> >> >> + * transmit operation.
> >> >> + */
> >> >> +void siw_send_terminate(struct siw_qp *qp)
> >> >> +{
> >> >> +	struct kvec		iov[3];
> >> >> +	struct msghdr		msg = {.msg_flags = MSG_DONTWAIT|MSG_EOR};
> >> >> +	struct iwarp_terminate	*term = NULL;
> >> >> +	union iwarp_hdr		*err_hdr = NULL;
> >> >> +	struct socket		*s = qp->attrs.sk;
> >> >> +	struct siw_iwarp_rx	*rx_ctx = &qp->rx_ctx;
> >> >> +	union iwarp_hdr		*rx_hdr = &rx_ctx->hdr;
> >> >> +	u32 crc = 0;
> >> >> +	int num_frags, len_terminate, rv;
> >> >> +
> >> >> +	if (!qp->term_info.valid)
> >> >> +		return;
> >> >> +
> >> >> +	qp->term_info.valid = 0;
> >> >> +
> >> >> +	if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
> >> >> +		siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
> >> >> +			   tx_type(tx_wqe(qp)));
> >> >> +		return;
> >> >> +	}
> >> >> +	if (!s && qp->cep)
> >> >> +		/* QP not yet in RTS. Take socket from connection end point */
> >> >> +		s = qp->cep->llp.sock;
> >> >> +
> >> >> +	if (!s) {
> >> >> +		siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
> >> >> +		return;
> >> >> +	}
> >> >> +
> >> >> +	term = kzalloc(sizeof(*term), GFP_KERNEL);
> >> >> +	if (!term)
> >> >> +		return;
> >> >> +
> >> >> +	term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
> >> >> +	term->ddp_mo = 0;
> >> >> +	term->ddp_msn = cpu_to_be32(1);
> >> >> +
> >> >> +	iov[0].iov_base = term;
> >> >> +	iov[0].iov_len = sizeof(*term);
> >> >> +
> >> >> +	if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
> >> >> +	    ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
> >> >> +	     (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
> >> >> +		err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
> >> >> +		if (!err_hdr) {
> >> >> +			kfree(term);
> >> >> +			return;
> >> >> +		}
> >> >> +	}
> >> >> +	memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
> >> >> +	       sizeof(struct iwarp_ctrl));
> >> >> +
> >> >> +	__rdmap_term_set_layer(term, qp->term_info.layer);
> >> >> +	__rdmap_term_set_etype(term, qp->term_info.etype);
> >> >> +	__rdmap_term_set_ecode(term, qp->term_info.ecode);
> >> >> +
> >> >> +	switch (qp->term_info.layer) {
> >> >> +
> >> >> +	case TERM_ERROR_LAYER_RDMAP:
> >> >> +		if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
> >> >> +			/* No additional DDP/RDMAP header to be included */
> >> >> +			break;
> >> >> +
> >> >> +		if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
> >> >> +			/*
> >> >> +			 * Complete RDMAP frame will get attached, and
> >> >> +			 * DDP segment length is valid
> >> >> +			 */
> >> >> +			term->flag_m = 1;
> >> >> +			term->flag_d = 1;
> >> >> +			term->flag_r = 1;
> >> >> +
> >> >> +			if (qp->term_info.in_tx) {
> >> >> +				struct iwarp_rdma_rreq *rreq;
> >> >> +				struct siw_wqe *wqe = tx_wqe(qp);
> >> >> +
> >> >> +				/* Inbound RREQ error, detected during
> >> >> +				 * RRESP creation. Take state from
> >> >> +				 * current TX work queue element to
> >> >> +				 * reconstruct peers RREQ.
> >> >> +				 */
> >> >> +				rreq = (struct iwarp_rdma_rreq *)err_hdr;
> >> >> +
> >> >> +				memcpy(&rreq->ctrl,
> >> >> +				       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
> >> >> +				       sizeof(struct iwarp_ctrl));
> >> >> +
> >> >> +				rreq->rsvd = 0;
> >> >> +				rreq->ddp_qn =
> >> >> +					htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
> >> >> +
> >> >> +				/* Provide RREQ's MSN as kept aside */
> >> >> +				rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
> >> >> +
> >> >> +				rreq->ddp_mo = htonl(wqe->processed);
> >> >> +				rreq->sink_stag = htonl(wqe->sqe.rkey);
> >> >> +				rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
> >> >> +				rreq->read_size = htonl(wqe->sqe.sge[0].length);
> >> >> +				rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
> >> >> +				rreq->source_to =
> >> >> +					cpu_to_be64(wqe->sqe.sge[0].laddr);
> >> >> +
> >> >> +				iov[1].iov_base = rreq;
> >> >> +				iov[1].iov_len = sizeof(*rreq);
> >> >> +
> >> >> +				rx_hdr = (union iwarp_hdr *)rreq;
> >> >> +			} else {
> >> >> +				/* Take RDMAP/DDP information from
> >> >> +				 * current (failed) inbound frame.
> >> >> +				 */
> >> >> +				iov[1].iov_base = rx_hdr;
> >> >> +
> >> >> +				if (__rdmap_opcode(&rx_hdr->ctrl) ==
> >> >> +				    RDMAP_RDMA_READ_REQ)
> >> >> +					iov[1].iov_len =
> >> >> +						sizeof(struct iwarp_rdma_rreq);
> >> >> +				else /* SEND type */
> >> >> +					iov[1].iov_len =
> >> >> +						sizeof(struct iwarp_send);
> >> >> +			}
> >> >> +		} else {
> >> >> +			/* Do not report DDP hdr information if packet
> >> >> +			 * layout is unknown
> >> >> +			 */
> >> >> +			if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
> >> >> +			    (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
> >> >> +				break;
> >> >> +
> >> >> +			iov[1].iov_base = rx_hdr;
> >> >> +
> >> >> +			/* Only DDP frame will get attached */
> >> >> +			if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
> >> >> +				iov[1].iov_len =
> >> >> +					sizeof(struct iwarp_rdma_write);
> >> >> +			else
> >> >> +				iov[1].iov_len = sizeof(struct iwarp_send);
> >> >> +
> >> >> +			term->flag_m = 1;
> >> >> +			term->flag_d = 1;
> >> >> +		}
> >> >> +		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
> >> >> +
> >> >> +		break;
> >> >> +
> >> >> +	case TERM_ERROR_LAYER_DDP:
> >> >> +		/* Report error encountered while DDP processing.
> >> >> +		 * This can only happen as a result of inbound
> >> >> +		 * DDP processing
> >> >> +		 */
> >> >> +
> >> >> +		/* Do not report DDP hdr information if packet
> >> >> +		 * layout is unknown
> >> >> +		 */
> >> >> +		if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
> >> >> +		     (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
> >> >> +		    ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
> >> >> +		     (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
> >> >> +			break;
> >> >> +
> >> >> +		iov[1].iov_base = rx_hdr;
> >> >> +
> >> >> +		if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
> >> >> +			iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
> >> >> +		else
> >> >> +			iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
> >> >> +
> >> >> +		term->flag_m = 1;
> >> >> +		term->flag_d = 1;
> >> >> +
> >> >> +		break;
> >> >> +
> >> >> +	default:
> >> >> +		break;
> >> >> +
> >> >> +	}
> >> >> +	if (term->flag_m || term->flag_d || term->flag_r) {
> >> >> +		iov[2].iov_base = &crc;
> >> >> +		iov[2].iov_len = sizeof(crc);
> >> >> +		len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
> >> >> +		num_frags = 3;
> >> >> +	} else {
> >> >> +		iov[1].iov_base = &crc;
> >> >> +		iov[1].iov_len = sizeof(crc);
> >> >> +		len_terminate = sizeof(*term) + MPA_CRC_SIZE;
> >> >> +		num_frags = 2;
> >> >> +	}
> >> >> +
> >> >> +	/* Adjust DDP Segment Length parameter, if valid */
> >> >> +	if (term->flag_m) {
> >> >> +		u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
> >> >> +		enum rdma_opcode op = __rdmap_opcode(&rx_hdr->ctrl);
> >> >> +
> >> >> +		real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
> >> >> +		rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
> >> >> +	}
> >> >> +
> >> >> +	term->ctrl.mpa_len = cpu_to_be16(len_terminate -
> >> >> +					 (MPA_HDR_SIZE + MPA_CRC_SIZE));
> >> >> +	if (qp->tx_ctx.mpa_crc_hd) {
> >> >> +		crypto_shash_init(rx_ctx->mpa_crc_hd);
> >> >> +		if (siw_crc_array(rx_ctx->mpa_crc_hd, (u8 *)iov[0].iov_base,
> >> >> +				  iov[0].iov_len))
> >> >> +			goto out;
> >> >> +
> >> >> +		if (num_frags == 3) {
> >> >> +			if (siw_crc_array(rx_ctx->mpa_crc_hd,
> >> >> +					  (u8 *)iov[1].iov_base,
> >> >> +					  iov[1].iov_len))
> >> >> +				goto out;
> >> >> +		}
> >> >> +		crypto_shash_final(rx_ctx->mpa_crc_hd, (u8 *)&crc);
> >> >> +	}
> >> >> +
> >> >> +	rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
> >> >> +	siw_dbg_qp(qp,
> >> >> +		   "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
> >> >> +		   rv == len_terminate ? "success" : "failure",
> >> >> +		   __rdmap_term_layer(term), __rdmap_term_etype(term),
> >> >> +		   __rdmap_term_ecode(term), rv);
> >> >> +out:
> >> >> +	kfree(term);
> >> >> +	kfree(err_hdr);
> >> >> +}
> >> >> +
> >> >> +/*
> >> >> + * handle all attrs other than state
> >> >> + */
> >> >> +static void siw_qp_modify_nonstate(struct siw_qp *qp,
> >> >> +				   struct siw_qp_attrs *attrs,
> >> >> +				   enum siw_qp_attr_mask mask)
> >> >> +{
> >> >> +	if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
> >> >> +		if (attrs->flags & SIW_RDMA_BIND_ENABLED)
> >> >> +			qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
> >> >> +		else
> >> >> +			qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
> >> >> +
> >> >> +		if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
> >> >> +			qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
> >> >> +		else
> >> >> +			qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
> >> >> +
> >> >> +		if (attrs->flags & SIW_RDMA_READ_ENABLED)
> >> >> +			qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
> >> >> +		else
> >> >> +			qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
> >> >> +	}
> >> >> +}
> >> >> +
> >> >> +/*
> >> >> + * caller holds qp->state_lock
> >> >> + */
> >> >> +int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
> >> >> +		  enum siw_qp_attr_mask mask)
> >> >> +{
> >> >> +	int	drop_conn = 0, rv = 0;
> >> >> +
> >> >> +	if (!mask)
> >> >> +		return 0;
> >> >> +
> >> >> +	siw_dbg_qp(qp, "state: %s => %s\n",
> >> >> +		   siw_qp_state_to_string[qp->attrs.state],
> >> >> +		   siw_qp_state_to_string[attrs->state]);
> >> >> +
> >> >> +	if (mask != SIW_QP_ATTR_STATE)
> >> >> +		siw_qp_modify_nonstate(qp, attrs, mask);
> >> >> +
> >> >> +	if (!(mask & SIW_QP_ATTR_STATE))
> >> >> +		return 0;
> >> >> +
> >> >> +	switch (qp->attrs.state) {
> >> >> +
> >> >> +	case SIW_QP_STATE_IDLE:
> >> >> +	case SIW_QP_STATE_RTR:
> >> >> +
> >> >> +		switch (attrs->state) {
> >> >> +
> >> >> +		case SIW_QP_STATE_RTS:
> >> >> +
> >> >> +			if (attrs->flags & SIW_MPA_CRC) {
> >> >> +				rv = siw_qp_enable_crc(qp);
> >> >> +				if (rv)
> >> >> +					break;
> >> >> +			}
> >> >> +			if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
> >> >> +				siw_dbg_qp(qp, "no socket\n");
> >> >> +				rv = -EINVAL;
> >> >> +				break;
> >> >> +			}
> >> >> +			if (!(mask & SIW_QP_ATTR_MPA)) {
> >> >> +				siw_dbg_qp(qp, "no MPA\n");
> >> >> +				rv = -EINVAL;
> >> >> +				break;
> >> >> +			}
> >> >> +			siw_dbg_qp(qp, "enter rts, peer 0x%08x, loc 0x%08x\n",
> >> >> +				   qp->cep->llp.raddr.sin_addr.s_addr,
> >> >> +				   qp->cep->llp.laddr.sin_addr.s_addr);
> >> >> +			/*
> >> >> +			 * Initialize iWARP TX state
> >> >> +			 */
> >> >> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
> >> >> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
> >> >> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
> >> >> +
> >> >> +			/*
> >> >> +			 * Initialize iWARP RX state
> >> >> +			 */
> >> >> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
> >> >> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
> >> >> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
> >> >> +
> >> >> +			/*
> >> >> +			 * init IRD free queue, caller has already checked
> >> >> +			 * limits.
> >> >> +			 */
> >> >> +			rv = siw_qp_readq_init(qp, attrs->irq_size,
> >> >> +					       attrs->orq_size);
> >> >> +			if (rv)
> >> >> +				break;
> >> >> +
> >> >> +			qp->attrs.sk = attrs->sk;
> >> >> +			qp->attrs.state = SIW_QP_STATE_RTS;
> >> >> +
> >> >> +			break;
> >> >> +
> >> >> +		case SIW_QP_STATE_ERROR:
> >> >> +			siw_rq_flush(qp);
> >> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> >> >> +			if (qp->cep) {
> >> >> +				siw_cep_put(qp->cep);
> >> >> +				qp->cep = NULL;
> >> >> +			}
> >> >> +			break;
> >> >> +
> >> >> +		case SIW_QP_STATE_RTR:
> >> >> +			/* ignore */
> >> >> +			break;
> >> >> +
> >> >> +		default:
> >> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> >> >> +				   siw_qp_state_to_string[qp->attrs.state],
> >> >> +				   siw_qp_state_to_string[attrs->state]);
> >> >> +			break;
> >> >> +		}
> >> >> +		break;
> >> >> +
> >> >> +	case SIW_QP_STATE_RTS:
> >> >> +
> >> >> +		switch (attrs->state) {
> >> >> +
> >> >> +		case SIW_QP_STATE_CLOSING:
> >> >> +			/*
> >> >> +			 * Verbs: move to IDLE if SQ and ORQ are empty.
> >> >> +			 * Move to ERROR otherwise. But first of all we must
> >> >> +			 * close the connection. So we keep CLOSING or ERROR
> >> >> +			 * as a transient state, schedule connection drop work
> >> >> +			 * and wait for the socket state change upcall to
> >> >> +			 * come back closed.
> >> >> +			 */
> >> >> +			if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
> >> >> +				qp->attrs.state = SIW_QP_STATE_CLOSING;
> >> >> +			} else {
> >> >> +				qp->attrs.state = SIW_QP_STATE_ERROR;
> >> >> +				siw_sq_flush(qp);
> >> >> +			}
> >> >> +			siw_rq_flush(qp);
> >> >> +
> >> >> +			drop_conn = 1;
> >> >> +			break;
> >> >> +
> >> >> +		case SIW_QP_STATE_TERMINATE:
> >> >> +			qp->attrs.state = SIW_QP_STATE_TERMINATE;
> >> >> +
> >> >> +			siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
> >> >> +					   RDMAP_ETYPE_CATASTROPHIC,
> >> >> +					   RDMAP_ECODE_UNSPECIFIED, 1);
> >> >> +			drop_conn = 1;
> >> >> +
> >> >> +			break;
> >> >> +
> >> >> +		case SIW_QP_STATE_ERROR:
> >> >> +			/*
> >> >> +			 * This is an emergency close.
> >> >> +			 *
> >> >> +			 * Any in progress transmit operation will get
> >> >> +			 * cancelled.
> >> >> +			 * This will likely result in a protocol failure,
> >> >> +			 * if a TX operation is in transit. The caller
> >> >> +			 * could unconditionally wait to give the current
> >> >> +			 * operation a chance to complete.
> >> >> +			 * Esp., how to handle the non-empty IRQ case?
> >> >> +			 * The peer was asking for data transfer at a valid
> >> >> +			 * point in time.
> >> >> +			 */
> >> >> +			siw_sq_flush(qp);
> >> >> +			siw_rq_flush(qp);
> >> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> >> >> +			drop_conn = 1;
> >> >> +
> >> >> +			break;
> >> >> +
> >> >> +		default:
> >> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> >> >> +				   siw_qp_state_to_string[qp->attrs.state],
> >> >> +				   siw_qp_state_to_string[attrs->state]);
> >> >> +			break;
> >> >> +		}
> >> >> +		break;
> >> >> +
> >> >> +	case SIW_QP_STATE_TERMINATE:
> >> >> +
> >> >> +		switch (attrs->state) {
> >> >> +
> >> >> +		case SIW_QP_STATE_ERROR:
> >> >> +			siw_rq_flush(qp);
> >> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> >> >> +
> >> >> +			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
> >> >> +				siw_sq_flush(qp);
> >> >> +
> >> >> +			break;
> >> >> +
> >> >> +		default:
> >> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> >> >> +				   siw_qp_state_to_string[qp->attrs.state],
> >> >> +				   siw_qp_state_to_string[attrs->state]);
> >> >> +		}
> >> >> +		break;
> >> >> +
> >> >> +	case SIW_QP_STATE_CLOSING:
> >> >> +
> >> >> +		switch (attrs->state) {
> >> >> +
> >> >> +		case SIW_QP_STATE_IDLE:
> >> >> +			WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
> >> >> +			qp->attrs.state = SIW_QP_STATE_IDLE;
> >> >> +
> >> >> +			break;
> >> >> +
> >> >> +		case SIW_QP_STATE_CLOSING:
> >> >> +			/*
> >> >> +			 * The LLP may already moved the QP to closing
> >> >> +			 * due to graceful peer close init
> >> >> +			 */
> >> >> +			break;
> >> >> +
> >> >> +		case SIW_QP_STATE_ERROR:
> >> >> +			/*
> >> >> +			 * QP was moved to CLOSING by LLP event
> >> >> +			 * not yet seen by user.
> >> >> +			 */
> >> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
> >> >> +
> >> >> +			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
> >> >> +				siw_sq_flush(qp);
> >> >> +
> >> >> +			siw_rq_flush(qp);
> >> >> +
> >> >> +			break;
> >> >> +
> >> >> +		default:
> >> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
> >> >> +				   siw_qp_state_to_string[qp->attrs.state],
> >> >> +				   siw_qp_state_to_string[attrs->state]);
> >> >> +
> >> >> +			return -ECONNABORTED;
> >> >> +		}
> >> >> +		break;
> >> >> +
> >> >> +	default:
> >> >> +		siw_dbg_qp(qp, " noop: state %s\n",
> >> >> +			   siw_qp_state_to_string[qp->attrs.state]);
> >> >> +		break;
> >> >> +	}
> >> >> +	if (drop_conn)
> >> >> +		siw_qp_cm_drop(qp, 0);
> >> >> +
> >> >> +	return rv;
> >> >> +}
> >> >> +
> >> >> +struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
> >> >> +{
> >> >> +	struct siw_qp *qp =  siw_qp_id2obj(to_siw_dev(base_dev), id);
> >> >> +
> >> >> +	if (qp) {
> >> >> +		/*
> >> >> +		 * siw_qp_id2obj() increments object reference count
> >> >> +		 */
> >> >> +		siw_qp_put(qp);
> >> >> +		siw_dbg_qp(qp, "got base QP");
> >> >> +
> >> >> +		return &qp->base_qp;
> >> >> +	}
> >> >> +	return (struct ib_qp *)NULL;
> >> >> +}
> >> >> +
> >> >> +/*
> >> >> + * siw_check_mem()
> >> >> + *
> >> >> + * Check protection domain, STAG state, access permissions and
> >> >> + * address range for memory object.
> >> >> + *
> >> >> + * @pd:		Protection Domain memory should belong to
> >> >> + * @mem:	memory to be checked
> >> >> + * @addr:	starting addr of mem
> >> >> + * @perms:	requested access permissions
> >> >> + * @len:	len of memory interval to be checked
> >> >> + *
> >> >> + */
> >> >> +int siw_check_mem(struct siw_pd *pd, struct siw_mem *mem, u64 addr,
> >> >> +		  enum siw_access_flags perms, int len)
> >> >> +{
> >> >> +	if (siw_mem2mr(mem)->pd != pd) {
> >> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: pd mismatch\n", OBJ_ID(pd));
> >> >> +		return -E_PD_MISMATCH;
> >> >> +	}
> >> >> +	if (!mem->stag_valid) {
> >> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: stag 0x%08x invalid\n",
> >> >> +			OBJ_ID(pd), OBJ_ID(mem));
> >> >> +		return -E_STAG_INVALID;
> >> >> +	}
> >> >> +	/*
> >> >> +	 * check access permissions
> >> >> +	 */
> >> >> +	if ((mem->perms & perms) < perms) {
> >> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: permissions 0x%08x < 0x%08x\n",
> >> >> +			OBJ_ID(pd), mem->perms, perms);
> >> >> +		return -E_ACCESS_PERM;
> >> >> +	}
> >> >> +	/*
> >> >> +	 * Check address interval: we relax the check to allow memory shrunk
> >> >> +	 * from the start address _after_ placing or fetching len bytes.
> >> >> +	 * TODO: this relaxation is probably overdone
> >> >> +	 */
> >> >> +	if (addr < mem->va || addr + len > mem->va + mem->len) {
> >> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: MEM interval len %d\n",
> >> >> +			OBJ_ID(pd), len);
> >> >> +		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx) out of bounds\n",
> >> >> +			(unsigned long long)addr,
> >> >> +			(unsigned long long)(addr + len));
> >> >> +		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx] LKey=0x%08x\n",
> >> >> +			(unsigned long long)mem->va,
> >> >> +			(unsigned long long)(mem->va + mem->len),
> >> >> +			OBJ_ID(mem));
> >> >> +
> >> >> +		return -E_BASE_BOUNDS;
> >> >> +	}
> >> >> +	return E_ACCESS_OK;
> >> >> +}
> >> >> +
> >> >> +/*
> >> >> + * siw_check_sge()
> >> >> + *
> >> >> + * Check SGE for access rights in given interval
> >> >> + *
> >> >> + * @pd:		Protection Domain memory should belong to
> >> >> + * @sge:	SGE to be checked
> >> >> + * @mem:	array of memory references
> >> >> + * @perms:	requested access permissions
> >> >> + * @off:	starting offset in SGE
> >> >> + * @len:	len of memory interval to be checked
> >> >> + *
> >> >> + * NOTE: Function references SGE's memory object (mem->obj)
> >> >> + * if not yet done. New reference is kept if check went ok and
> >> >> + * released if check failed. If mem->obj is already valid, no new
> >> >> + * lookup is being done and mem is not released if the check fails.
> >> >> + */
> >> >> +int
> >> >> +siw_check_sge(struct siw_pd *pd, struct siw_sge *sge,
> >> >> +	      struct siw_mem *mem[], enum siw_access_flags perms,
> >> >> +	      u32 off, int len)
> >> >> +{
> >> >> +	struct siw_device *sdev = pd->hdr.sdev;
> >> >> +	int new_ref = 0, rv = E_ACCESS_OK;
> >> >> +
> >> >> +	if (len + off > sge->length) {
> >> >> +		rv = -E_BASE_BOUNDS;
> >> >> +		goto fail;
> >> >> +	}
> >> >> +	if (*mem == NULL) {
> >> >> +		*mem = siw_mem_id2obj(sdev, sge->lkey >> 8);
> >> >> +		if (*mem == NULL) {
> >> >> +			rv = -E_STAG_INVALID;
> >> >> +			goto fail;
> >> >> +		}
> >> >> +		new_ref = 1;
> >> >> +	}
> >> >> +
> >> >> +	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
> >> >> +	if (rv)
> >> >> +		goto fail;
> >> >> +
> >> >> +	return 0;
> >> >> +
> >> >> +fail:
> >> >> +	if (new_ref) {
> >> >> +		siw_mem_put(*mem);
> >> >> +		*mem = NULL;
> >> >> +	}
> >> >> +	return rv;
> >> >> +}
> >> >> +
> >> >> +void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
> >> >> +{
> >> >> +	rreq->id = sqe->id;
> >> >> +	rreq->opcode = sqe->opcode;
> >> >> +	rreq->sge[0].laddr = sqe->sge[0].laddr;
> >> >> +	rreq->sge[0].length = sqe->sge[0].length;
> >> >> +	rreq->sge[0].lkey = sqe->sge[0].lkey;
> >> >> +	rreq->sge[1].lkey = sqe->sge[1].lkey;
> >> >> +	rreq->flags = sqe->flags | SIW_WQE_VALID;
> >> >> +	rreq->num_sge = 1;
> >> >> +}
> >> >> +
> >> >> +/*
> >> >> + * Must be called with SQ locked.
> >> >> + * To avoid complete SQ starvation by constant inbound READ requests,
> >> >> + * the active IRQ will not be served after qp->irq_burst, if the
> >> >> + * SQ has pending work.
> >> >> + */
> >> >> +int siw_activate_tx(struct siw_qp *qp)
> >> >> +{
> >> >> +	struct siw_sqe	*irqe, *sqe;
> >> >> +	struct siw_wqe	*wqe = tx_wqe(qp);
> >> >> +	int rv = 1;
> >> >> +
> >> >> +	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
> >> >> +
> >> >> +	if (irqe->flags & SIW_WQE_VALID) {
> >> >> +		sqe = sq_get_next(qp);
> >> >> +
> >> >> +		/*
> >> >> +		 * Avoid local WQE processing starvation in case
> >> >> +		 * of constant inbound READ request stream
> >> >> +		 */
> >> >> +		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
> >> >> +			qp->irq_burst = 0;
> >> >> +			goto skip_irq;
> >> >> +		}
> >> >> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
> >> >> +		wqe->wr_status = SIW_WR_QUEUED;
> >> >> +
> >> >> +		/* start READ RESPONSE */
> >> >> +		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
> >> >> +		wqe->sqe.flags = 0;
> >> >> +		if (irqe->num_sge) {
> >> >> +			wqe->sqe.num_sge = 1;
> >> >> +			wqe->sqe.sge[0].length = irqe->sge[0].length;
> >> >> +			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
> >> >> +			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
> >> >> +		} else {
> >> >> +			wqe->sqe.num_sge = 0;
> >> >> +		}
> >> >> +
> >> >> +		/* Retain original RREQ's message sequence number for
> >> >> +		 * potential error reporting cases.
> >> >> +		 */
> >> >> +		wqe->sqe.sge[1].length = irqe->sge[1].length;
> >> >> +
> >> >> +		wqe->sqe.rkey = irqe->rkey;
> >> >> +		wqe->sqe.raddr = irqe->raddr;
> >> >> +
> >> >> +		wqe->processed = 0;
> >> >> +		qp->irq_get++;
> >> >> +
> >> >> +		/* mark current IRQ entry free */
> >> >> +		smp_store_mb(irqe->flags, 0);
> >> >> +
> >> >> +		goto out;
> >> >> +	}
> >> >> +
> >> >> +	sqe = sq_get_next(qp);
> >> >> +	if (sqe) {
> >> >> +skip_irq:
> >> >> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
> >> >> +		wqe->wr_status = SIW_WR_QUEUED;
> >> >> +
> >> >> +		/* First copy SQE to kernel private memory */
> >> >> +		memcpy(&wqe->sqe, sqe, sizeof(*sqe));
> >> >> +
> >> >> +		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
> >> >> +			rv = -EINVAL;
> >> >> +			goto out;
> >> >> +		}
> >> >> +		if (wqe->sqe.flags & SIW_WQE_INLINE) {
> >> >> +			if (wqe->sqe.opcode != SIW_OP_SEND &&
> >> >> +			    wqe->sqe.opcode != SIW_OP_WRITE) {
> >> >> +				rv = -EINVAL;
> >> >> +				goto out;
> >> >> +			}
> >> >> +			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
> >> >> +				rv = -EINVAL;
> >> >> +				goto out;
> >> >> +			}
> >> >> +			wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1];
> >> >> +			wqe->sqe.sge[0].lkey = 0;
> >> >> +			wqe->sqe.num_sge = 1;
> >> >> +		}
> >> >> +		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
> >> >> +			/* A READ cannot be fenced */
> >> >> +			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
> >> >> +			    wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV)) {
> >> >> +				siw_dbg_qp(qp, "cannot fence read\n");
> >> >> +				rv = -EINVAL;
> >> >> +				goto out;
> >> >> +			}
> >> >> +			spin_lock(&qp->orq_lock);
> >> >> +
> >> >> +			if (!siw_orq_empty(qp)) {
> >> >> +				qp->tx_ctx.orq_fence = 1;
> >> >> +				rv = 0;
> >> >> +			}
> >> >> +			spin_unlock(&qp->orq_lock);
> >> >> +
> >> >> +		} else if (wqe->sqe.opcode == SIW_OP_READ ||
> >> >> +			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
> >> >> +			struct siw_sqe	*rreq;
> >> >> +
> >> >> +			wqe->sqe.num_sge = 1;
> >> >> +
> >> >> +			spin_lock(&qp->orq_lock);
> >> >> +
> >> >> +			rreq = orq_get_free(qp);
> >> >> +			if (rreq) {
> >> >> +				/*
> >> >> +				 * Make an immediate copy in ORQ to be ready
> >> >> +				 * to process loopback READ reply
> >> >> +				 */
> >> >> +				siw_read_to_orq(rreq, &wqe->sqe);
> >> >> +				qp->orq_put++;
> >> >> +			} else {
> >> >> +				qp->tx_ctx.orq_fence = 1;
> >> >> +				rv = 0;
> >> >> +			}
> >> >> +			spin_unlock(&qp->orq_lock);
> >> >> +		}
> >> >> +
> >> >> +		/* Clear SQE, can be re-used by application */
> >> >> +		smp_store_mb(sqe->flags, 0);
> >> >> +		qp->sq_get++;
> >> >> +	} else {
> >> >> +		rv = 0;
> >> >> +	}
> >> >> +out:
> >> >> +	if (unlikely(rv < 0)) {
> >> >> +		siw_dbg_qp(qp, "error %d\n", rv);
> >> >> +		wqe->wr_status = SIW_WR_IDLE;
> >> >> +	}
> >> >> +	return rv;
> >> >> +}
> >> >> +
> >> >> +/*
> >> >> + * Check if current CQ state qualifies for
> >> >> + * calling CQ completion handler. Must be
> >> >> + * called with CQ lock held.
> >> >> + */
> >> >> +static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
> >> >> +{
> >> >> +	u64 cq_notify;
> >> >> +
> >> >> +	if (!cq->base_cq.comp_handler)
> >> >> +		return false;
> >> >> +
> >> >> +	cq_notify = READ_ONCE(*cq->notify);
> >> >> +
> >> >> +	if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
> >> >> +	    ((cq_notify & SIW_NOTIFY_SOLICITED) &&
> >> >> +	     (flags & SIW_WQE_SOLICITED))) {
> >> >> +		/* dis-arm CQ */
> >> >> +		smp_store_mb(*cq->notify, SIW_NOTIFY_NOT);
> >> >> +
> >> >> +		return true;
> >> >> +	}
> >> >> +	return false;
> >> >> +}
> >> >> +
> >> >> +/* Must be called without holding CQ lock */
> >> >> +static inline void siw_cq_completion(struct siw_cq *cq)
> >> >> +{
> >> >> +	siw_dbg_obj(cq, "Completion\n");
> >> >> +	(*cq->base_cq.comp_handler)(&cq->base_cq, cq->base_cq.cq_context);
> >> >> +}
> >> >> +
> >> >> +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
> >> >> +		     enum siw_wc_status status)
> >> >> +{
> >> >> +	struct siw_cq *cq = qp->scq;
> >> >> +	int rv = 0;
> >> >> +
> >> >> +	if (cq) {
> >> >> +		u32 sqe_flags = sqe->flags;
> >> >> +		struct siw_cqe *cqe;
> >> >> +		u32 idx;
> >> >> +		unsigned long flags;
> >> >> +
> >> >> +		spin_lock_irqsave(&cq->lock, flags);
> >> >> +
> >> >> +		idx = cq->cq_put % cq->num_cqe;
> >> >> +		cqe = &cq->queue[idx];
> >> >> +
> >> >> +		if (!READ_ONCE(cqe->flags)) {
> >> >> +			bool notify;
> >> >> +
> >> >> +			cqe->id = sqe->id;
> >> >> +			cqe->opcode = sqe->opcode;
> >> >> +			cqe->status = status;
> >> >> +			cqe->imm_data = 0;
> >> >> +			cqe->bytes = bytes;
> >> >> +
> >> >> +			if (cq->kernel_verbs) {
> >> >
> >> >kernel_verbs is managed by ib/core, why should the driver know about it?
> >> >
> >> User land CQEs carry the corresponding QP ID, kernel clients expect a QP
> >> pointer here. That's where the difference comes from. This distinction
> >> between kernel and user clients is needed in more places, e.g. since the
> >> user land's CQE array is memory mapped, whereas the kernel land's is not.
> >
> >It is passed through udata, your code should check existence of "udata"
> >and not manage a user/kernel flag.
> >
> >
>
> Right. I simply wanted to avoid those potentially cache-thrashing
> accesses - the udata pointer of the ib_device is quite far away
> from the CQ array. So we kind of trade redundancy for performance,
> and CQE creation is definitely on the fast path...

Do you have any performance numbers to support the need for this user/kernel flag?
Also rdma_is_kernel_res() can give you a very fast answer on whether you have a user
or kernel object.

Thanks

>
> Thanks
> Bernard.
>
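As a rough illustration of the alternative being discussed: a minimal sketch,
assuming the struct siw_cq / embedded ib_cq layout quoted above and the restrack
entry the RDMA core keeps in every ib_cq; the cqe->base_qp and cqe->qp_id fields
and the QP_ID() macro are illustrative assumptions, not taken from the posted patch.

#include <rdma/restrack.h>	/* for rdma_is_kernel_res() */

/* Sketch only: derive kernel vs. user consumer from the restrack entry
 * that the RDMA core embeds in every ib_cq, instead of keeping a private
 * kernel_verbs flag in struct siw_cq. rdma_is_kernel_res() is a cheap
 * inline test on core-owned state.
 */
static void siw_cqe_set_qp_ref(struct siw_cq *cq, struct siw_cqe *cqe,
			       struct siw_qp *qp)
{
	if (rdma_is_kernel_res(&cq->base_cq.res)) {
		/* kernel clients expect a QP pointer in the CQE */
		cqe->base_qp = &qp->base_qp;	/* assumed field name */
	} else {
		/* user space only sees the QP ID in the mmapped CQE array */
		cqe->qp_id = QP_ID(qp);		/* assumed field and macro */
	}
}

Whether such a test is cheap enough for the completion fast path is exactly the
open question here; latching the result of '!udata' once at CQ creation time would
be the other way to derive the same information from core-provided data without a
per-CQE pointer chase.
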
Bernard Metzler Feb. 27, 2019, 4:17 p.m. UTC | #6
-----"Leon Romanovsky" <leon@kernel.org> wrote: -----

>To: "Bernard Metzler" <BMT@zurich.ibm.com>
>From: "Leon Romanovsky" <leon@kernel.org>
>Date: 02/27/2019 05:01PM
>Cc: linux-rdma@vger.kernel.org
>Subject: Re: [PATCH v5 08/13] SIW queue pair methods
>
>On Wed, Feb 27, 2019 at 02:59:26PM +0000, Bernard Metzler wrote:
>> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
>>
>> >To: "Bernard Metzler" <BMT@zurich.ibm.com>
>> >From: "Leon Romanovsky" <leon@kernel.org>
>> >Date: 02/27/2019 01:49PM
>> >Cc: linux-rdma@vger.kernel.org
>> >Subject: Re: [PATCH v5 08/13] SIW queue pair methods
>> >
>> >On Wed, Feb 27, 2019 at 11:54:07AM +0000, Bernard Metzler wrote:
>> >> -----"Leon Romanovsky" <leon@kernel.org> wrote: -----
>> >>
>> >> >To: "Bernard Metzler" <bmt@zurich.ibm.com>
>> >> >From: "Leon Romanovsky" <leon@kernel.org>
>> >> >Date: 02/24/2019 02:19PM
>> >> >Cc: linux-rdma@vger.kernel.org
>> >> >Subject: Re: [PATCH v5 08/13] SIW queue pair methods
>> >> >
>> >> >On Tue, Feb 19, 2019 at 11:08:58AM +0100, Bernard Metzler wrote:
>> >> >> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
>> >> >> ---
>> >> >>  drivers/infiniband/sw/siw/siw_qp.c | 1478
>> >> >++++++++++++++++++++++++++++
>> >> >>  1 file changed, 1478 insertions(+)
>> >> >>  create mode 100644 drivers/infiniband/sw/siw/siw_qp.c
>> >> >>
>> >> >> diff --git a/drivers/infiniband/sw/siw/siw_qp.c
>> >> >b/drivers/infiniband/sw/siw/siw_qp.c
>> >> >> new file mode 100644
>> >> >> index 000000000000..75fd151dae39
>> >> >> --- /dev/null
>> >> >> +++ b/drivers/infiniband/sw/siw/siw_qp.c
>> >> >> @@ -0,0 +1,1478 @@
>> >> >> +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
>> >> >> +/*
>> >> >> + * Software iWARP device driver
>> >> >> + *
>> >> >> + * Authors: Bernard Metzler <bmt@zurich.ibm.com>
>> >> >> + *
>> >> >> + * Copyright (c) 2008-2018, IBM Corporation
>> >> >> + *
>> >> >> + * This software is available to you under a choice of one
>of
>> >two
>> >> >> + * licenses. You may choose to be licensed under the terms
>of
>> >the
>> >> >GNU
>> >> >> + * General Public License (GPL) Version 2, available from
>the
>> >file
>> >> >> + * COPYING in the main directory of this source tree, or the
>> >> >> + * BSD license below:
>> >> >> + *
>> >> >> + *   Redistribution and use in source and binary forms, with
>or
>> >> >> + *   without modification, are permitted provided that the
>> >> >following
>> >> >> + *   conditions are met:
>> >> >> + *
>> >> >> + *   - Redistributions of source code must retain the above
>> >> >copyright notice,
>> >> >> + *     this list of conditions and the following disclaimer.
>> >> >> + *
>> >> >> + *   - Redistributions in binary form must reproduce the
>above
>> >> >copyright
>> >> >> + *     notice, this list of conditions and the following
>> >> >disclaimer in the
>> >> >> + *     documentation and/or other materials provided with
>the
>> >> >distribution.
>> >> >> + *
>> >> >> + *   - Neither the name of IBM nor the names of its
>> >contributors
>> >> >may be
>> >> >> + *     used to endorse or promote products derived from this
>> >> >software without
>> >> >> + *     specific prior written permission.
>> >> >> + *
>> >> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
>> >KIND,
>> >> >> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
>> >WARRANTIES
>> >> >OF
>> >> >> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
>> >> >> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
>COPYRIGHT
>> >> >HOLDERS
>> >> >> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
>WHETHER
>> >IN
>> >> >AN
>> >> >> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
>OF
>> >OR
>> >> >IN
>> >> >> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>IN
>> >> >THE
>> >> >> + * SOFTWARE.
>> >> >> + */
>> >> >> +
>> >> >> +#include <linux/errno.h>
>> >> >> +#include <linux/types.h>
>> >> >> +#include <linux/net.h>
>> >> >> +#include <linux/file.h>
>> >> >> +#include <linux/scatterlist.h>
>> >> >> +#include <linux/highmem.h>
>> >> >> +#include <linux/vmalloc.h>
>> >> >> +#include <asm/barrier.h>
>> >> >> +#include <net/sock.h>
>> >> >> +#include <net/tcp_states.h>
>> >> >> +#include <net/tcp.h>
>> >> >> +
>> >> >> +#include <rdma/iw_cm.h>
>> >> >> +#include <rdma/ib_verbs.h>
>> >> >> +#include <rdma/ib_smi.h>
>> >> >> +#include <rdma/ib_user_verbs.h>
>> >> >> +
>> >> >> +#include "siw.h"
>> >> >> +#include "siw_obj.h"
>> >> >> +#include "siw_cm.h"
>> >> >> +
>> >> >> +static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
>> >> >> +	[SIW_QP_STATE_IDLE]		= "IDLE",
>> >> >> +	[SIW_QP_STATE_RTR]		= "RTR",
>> >> >> +	[SIW_QP_STATE_RTS]		= "RTS",
>> >> >> +	[SIW_QP_STATE_CLOSING]		= "CLOSING",
>> >> >> +	[SIW_QP_STATE_TERMINATE]	= "TERMINATE",
>> >> >> +	[SIW_QP_STATE_ERROR]		= "ERROR"
>> >> >> +};
>> >> >> +
>> >> >> +/*
>> >> >> + * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
>> >> >> + * per-RDMAP message basis. Please keep order of initializer. All MPA len
>> >> >> + * is initialized to minimum packet size.
>> >> >> + */
>> >> >> +struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = { {
>> >> >> +	/* RDMAP_RDMA_WRITE */
>> >> >> +	.hdr_len = sizeof(struct iwarp_rdma_write),
>> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
>> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST
>> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
>> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
>> >> >> +		| cpu_to_be16(RDMAP_RDMA_WRITE),
>> >> >> +	.proc_data = siw_proc_write
>> >> >> +},
>> >> >> +{	/* RDMAP_RDMA_READ_REQ */
>> >> >> +	.hdr_len = sizeof(struct iwarp_rdma_rreq),
>> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
>> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
>> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
>> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
>> >> >> +		| cpu_to_be16(RDMAP_RDMA_READ_REQ),
>> >> >> +	.proc_data = siw_proc_rreq
>> >> >> +},
>> >> >> +{	/* RDMAP_RDMA_READ_RESP */
>> >> >> +	.hdr_len = sizeof(struct iwarp_rdma_rresp),
>> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
>> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST
>> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
>> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
>> >> >> +		| cpu_to_be16(RDMAP_RDMA_READ_RESP),
>> >> >> +	.proc_data = siw_proc_rresp
>> >> >> +},
>> >> >> +{	/* RDMAP_SEND */
>> >> >> +	.hdr_len = sizeof(struct iwarp_send),
>> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
>> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
>> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
>> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
>> >> >> +		| cpu_to_be16(RDMAP_SEND),
>> >> >> +	.proc_data = siw_proc_send
>> >> >> +},
>> >> >> +{	/* RDMAP_SEND_INVAL */
>> >> >> +	.hdr_len = sizeof(struct iwarp_send_inv),
>> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
>> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
>> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
>> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
>> >> >> +		| cpu_to_be16(RDMAP_SEND_INVAL),
>> >> >> +	.proc_data = siw_proc_send
>> >> >> +},
>> >> >> +{	/* RDMAP_SEND_SE */
>> >> >> +	.hdr_len = sizeof(struct iwarp_send),
>> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
>> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
>> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
>> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
>> >> >> +		| cpu_to_be16(RDMAP_SEND_SE),
>> >> >> +	.proc_data = siw_proc_send
>> >> >> +},
>> >> >> +{	/* RDMAP_SEND_SE_INVAL */
>> >> >> +	.hdr_len = sizeof(struct iwarp_send_inv),
>> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
>> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
>> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
>> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
>> >> >> +		| cpu_to_be16(RDMAP_SEND_SE_INVAL),
>> >> >> +	.proc_data = siw_proc_send
>> >> >> +},
>> >> >> +{	/* RDMAP_TERMINATE */
>> >> >> +	.hdr_len = sizeof(struct iwarp_terminate),
>> >> >> +	.ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
>> >> >> +	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
>> >> >> +		| cpu_to_be16(DDP_VERSION << 8)
>> >> >> +		| cpu_to_be16(RDMAP_VERSION << 6)
>> >> >> +		| cpu_to_be16(RDMAP_TERMINATE),
>> >> >> +	.proc_data = siw_proc_terminate
>> >> >> +} };
>> >> >> +
>> >> >> +void siw_qp_llp_data_ready(struct sock *sk)
>> >> >> +{
>> >> >> +	struct siw_qp		*qp;
>> >> >> +
>> >> >> +	read_lock(&sk->sk_callback_lock);
>> >> >> +
>> >> >> +	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
>> >> >> +		goto done;
>> >> >> +
>> >> >> +	qp = sk_to_qp(sk);
>> >> >> +
>> >> >> +	if (likely(!qp->rx_ctx.rx_suspend &&
>> >> >> +		   down_read_trylock(&qp->state_lock))) {
>> >> >> +		read_descriptor_t rd_desc = {.arg.data = qp, .count = 1};
>> >> >> +
>> >> >> +		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
>> >> >> +			/*
>> >> >> +			 * Implements data receive operation during
>> >> >> +			 * socket callback. TCP gracefully catches
>> >> >> +			 * the case where there is nothing to receive
>> >> >> +			 * (not calling siw_tcp_rx_data() then).
>> >> >> +			 */
>> >> >> +			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
>> >> >> +
>> >> >> +		up_read(&qp->state_lock);
>> >> >> +	} else {
>> >> >> +		siw_dbg_qp(qp, "unable to rx, suspend: %d\n",
>> >> >> +			   qp->rx_ctx.rx_suspend);
>> >> >> +	}
>> >> >> +done:
>> >> >> +	read_unlock(&sk->sk_callback_lock);
>> >> >> +}
>> >> >> +
>> >> >> +void siw_qp_llp_close(struct siw_qp *qp)
>> >> >> +{
>> >> >> +	siw_dbg_qp(qp, "enter llp close, state = %s\n",
>> >> >> +		   siw_qp_state_to_string[qp->attrs.state]);
>> >> >> +
>> >> >> +	down_write(&qp->state_lock);
>> >> >> +
>> >> >> +	qp->rx_ctx.rx_suspend = 1;
>> >> >> +	qp->tx_ctx.tx_suspend = 1;
>> >> >> +	qp->attrs.sk = NULL;
>> >> >> +
>> >> >> +	switch (qp->attrs.state) {
>> >> >> +
>> >> >> +	case SIW_QP_STATE_RTS:
>> >> >> +	case SIW_QP_STATE_RTR:
>> >> >> +	case SIW_QP_STATE_IDLE:
>> >> >> +	case SIW_QP_STATE_TERMINATE:
>> >> >> +
>> >> >> +		qp->attrs.state = SIW_QP_STATE_ERROR;
>> >> >> +
>> >> >> +		break;
>> >> >> +	/*
>> >> >> +	 * SIW_QP_STATE_CLOSING:
>> >> >> +	 *
>> >> >> +	 * This is a forced close. shall the QP be moved to
>> >> >> +	 * ERROR or IDLE ?
>> >> >> +	 */
>> >> >> +	case SIW_QP_STATE_CLOSING:
>> >> >> +		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
>> >> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
>> >> >> +		else
>> >> >> +			qp->attrs.state = SIW_QP_STATE_IDLE;
>> >> >> +
>> >> >> +		break;
>> >> >> +
>> >> >> +	default:
>> >> >> +		siw_dbg_qp(qp, "llp close: no state transition needed:
>%s\n",
>> >> >> +			   siw_qp_state_to_string[qp->attrs.state]);
>> >> >> +		break;
>> >> >> +	}
>> >> >> +	siw_sq_flush(qp);
>> >> >> +	siw_rq_flush(qp);
>> >> >> +
>> >> >> +	/*
>> >> >> +	 * Dereference closing CEP
>> >> >> +	 */
>> >> >> +	if (qp->cep) {
>> >> >> +		siw_cep_put(qp->cep);
>> >> >> +		qp->cep = NULL;
>> >> >> +	}
>> >> >> +
>> >> >> +	up_write(&qp->state_lock);
>> >> >> +
>> >> >> +	siw_dbg_qp(qp, "llp close exit: state %s\n",
>> >> >> +		   siw_qp_state_to_string[qp->attrs.state]);
>> >> >> +}
>> >> >> +
>> >> >> +/*
>> >> >> + * socket callback routine informing about newly available send space.
>> >> >> + * Function schedules SQ work for processing SQ items.
>> >> >> + */
>> >> >> +void siw_qp_llp_write_space(struct sock *sk)
>> >> >> +{
>> >> >> +	struct siw_cep	*cep = sk_to_cep(sk);
>> >> >> +
>> >> >> +	cep->sk_write_space(sk);
>> >> >> +
>> >> >> +	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
>> >> >> +		(void) siw_sq_start(cep->qp);
>> >> >> +}
>> >> >> +
>> >> >> +static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
>> >> >> +{
>> >> >> +	if (!irq_size)
>> >> >> +		irq_size = 1;
>> >> >> +	if (!orq_size)
>> >> >> +		orq_size = 1;
>> >> >> +
>> >> >> +	qp->attrs.irq_size = irq_size;
>> >> >> +	qp->attrs.orq_size = orq_size;
>> >> >> +
>> >> >> +	qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
>> >> >> +	if (!qp->irq) {
>> >> >> +		siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
>> >> >> +		qp->attrs.irq_size = 0;
>> >> >> +		return -ENOMEM;
>> >> >> +	}
>> >> >> +	qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
>> >> >> +	if (!qp->orq) {
>> >> >> +		siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
>> >> >> +		qp->attrs.orq_size = 0;
>> >> >> +		qp->attrs.irq_size = 0;
>> >> >> +		vfree(qp->irq);
>> >> >> +		return -ENOMEM;
>> >> >> +	}
>> >> >> +	return 0;
>> >> >> +}
>> >> >> +
>> >> >> +static int siw_qp_enable_crc(struct siw_qp *qp)
>> >> >> +{
>> >> >> +	struct siw_iwarp_rx *c_rx = &qp->rx_ctx;
>> >> >> +	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
>> >> >> +	int rv = 0;
>> >> >> +
>> >> >> +	if (siw_crypto_shash == NULL) {
>> >> >> +		rv = -ENOENT;
>> >> >> +		goto error;
>> >> >> +	}
>> >> >> +	c_tx->mpa_crc_hd = kzalloc(sizeof(struct shash_desc) +
>> >> >> +				   crypto_shash_descsize(siw_crypto_shash),
>> >> >> +				   GFP_KERNEL);
>> >> >> +	c_rx->mpa_crc_hd = kzalloc(sizeof(struct shash_desc) +
>> >> >> +				   crypto_shash_descsize(siw_crypto_shash),
>> >> >> +				   GFP_KERNEL);
>> >> >> +	if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
>> >> >> +		rv = -ENOMEM;
>> >> >> +		goto error;
>> >> >> +	}
>> >> >> +	c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
>> >> >> +	c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
>> >> >> +
>> >> >> +	return 0;
>> >> >> +error:
>> >> >> +	siw_dbg_qp(qp, "failed loading crc32c. error %d\n", rv);
>> >> >> +
>> >> >> +	kfree(c_tx->mpa_crc_hd);
>> >> >> +	kfree(c_rx->mpa_crc_hd);
>> >> >> +
>> >> >> +	c_tx->mpa_crc_hd = c_rx->mpa_crc_hd = NULL;
>> >> >> +
>> >> >> +	return rv;
>> >> >> +}
>> >> >> +
>> >> >> +/*
>> >> >> + * Send a non signalled READ or WRITE to peer side as negotiated
>> >> >> + * with MPAv2 P2P setup protocol. The work request is only created
>> >> >> + * as a current active WR and does not consume Send Queue space.
>> >> >> + *
>> >> >> + * Caller must hold QP state lock.
>> >> >> + */
>> >> >> +int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
>> >> >> +{
>> >> >> +	struct siw_wqe	*wqe = tx_wqe(qp);
>> >> >> +	unsigned long flags;
>> >> >> +	int rv = 0;
>> >> >> +
>> >> >> +	spin_lock_irqsave(&qp->sq_lock, flags);
>> >> >> +
>> >> >> +	if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
>> >> >> +		spin_unlock_irqrestore(&qp->sq_lock, flags);
>> >> >> +		return -EIO;
>> >> >> +	}
>> >> >> +	memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
>> >> >> +
>> >> >> +	wqe->wr_status = SIW_WR_QUEUED;
>> >> >> +	wqe->sqe.flags = 0;
>> >> >> +	wqe->sqe.num_sge = 1;
>> >> >> +	wqe->sqe.sge[0].length = 0;
>> >> >> +	wqe->sqe.sge[0].laddr = 0;
>> >> >> +	wqe->sqe.sge[0].lkey = 0;
>> >> >> +	/*
>> >> >> +	 * While it must not be checked for inbound zero length
>> >> >> +	 * READ/WRITE, some HW may treat STag 0 special.
>> >> >> +	 */
>> >> >> +	wqe->sqe.rkey = 1;
>> >> >> +	wqe->sqe.raddr = 0;
>> >> >> +	wqe->processed = 0;
>> >> >> +
>> >> >> +	if (ctrl & MPA_V2_RDMA_WRITE_RTR)
>> >> >> +		wqe->sqe.opcode = SIW_OP_WRITE;
>> >> >> +	else if (ctrl & MPA_V2_RDMA_READ_RTR) {
>> >> >> +		struct siw_sqe	*rreq;
>> >> >> +
>> >> >> +		wqe->sqe.opcode = SIW_OP_READ;
>> >> >> +
>> >> >> +		spin_lock(&qp->orq_lock);
>> >> >> +
>> >> >> +		rreq = orq_get_free(qp);
>> >> >> +		if (rreq) {
>> >> >> +			siw_read_to_orq(rreq, &wqe->sqe);
>> >> >> +			qp->orq_put++;
>> >> >> +		} else
>> >> >> +			rv = -EIO;
>> >> >> +
>> >> >> +		spin_unlock(&qp->orq_lock);
>> >> >> +	} else
>> >> >> +		rv = -EINVAL;
>> >> >> +
>> >> >> +	if (rv)
>> >> >> +		wqe->wr_status = SIW_WR_IDLE;
>> >> >> +
>> >> >> +	spin_unlock_irqrestore(&qp->sq_lock, flags);
>> >> >> +
>> >> >> +	if (!rv)
>> >> >> +		rv = siw_sq_start(qp);
>> >> >> +
>> >> >> +	return rv;
>> >> >> +}
>> >> >> +
>> >> >> +/*
>> >> >> + * Map memory access error to DDP tagged error
>> >> >> + */
>> >> >> +enum ddp_ecode siw_tagged_error(enum siw_access_state state)
>> >> >> +{
>> >> >> +	if (state == E_STAG_INVALID)
>> >> >> +		return DDP_ECODE_T_INVALID_STAG;
>> >> >> +	if (state == E_BASE_BOUNDS)
>> >> >> +		return DDP_ECODE_T_BASE_BOUNDS;
>> >> >> +	if (state == E_PD_MISMATCH)
>> >> >> +		return DDP_ECODE_T_STAG_NOT_ASSOC;
>> >> >> +	if (state == E_ACCESS_PERM)
>> >> >> +		/*
>> >> >> +		 * RFC 5041 (DDP) lacks an ecode for insufficient access
>> >> >> +		 * permissions. 'Invalid STag' seems to be the closest
>> >> >> +		 * match though.
>> >> >> +		 */
>> >> >> +		return DDP_ECODE_T_INVALID_STAG;
>> >> >> +
>> >> >> +	WARN_ON(1);
>> >> >> +
>> >> >> +	return DDP_ECODE_T_INVALID_STAG;
>> >> >> +}
>> >> >> +
>> >> >> +/*
>> >> >> + * Map memory access error to RDMAP protection error
>> >> >> + */
>> >> >> +enum rdmap_ecode siw_rdmap_error(enum siw_access_state
>state)
>> >> >> +{
>> >> >> +	if (state == E_STAG_INVALID)
>> >> >> +		return RDMAP_ECODE_INVALID_STAG;
>> >> >> +	if (state == E_BASE_BOUNDS)
>> >> >> +		return RDMAP_ECODE_BASE_BOUNDS;
>> >> >> +	if (state == E_PD_MISMATCH)
>> >> >> +		return RDMAP_ECODE_STAG_NOT_ASSOC;
>> >> >> +	if (state == E_ACCESS_PERM)
>> >> >> +		return RDMAP_ECODE_ACCESS_RIGHTS;
>> >> >> +
>> >> >> +	return RDMAP_ECODE_UNSPECIFIED;
>> >> >> +}
>> >> >> +
>> >> >> +void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer,
>> >> >> +			u8 etype, u8 ecode, int in_tx)
>> >> >> +{
>> >> >> +	if (!qp->term_info.valid) {
>> >> >> +		memset(&qp->term_info, 0, sizeof(qp->term_info));
>> >> >> +		qp->term_info.layer = layer;
>> >> >> +		qp->term_info.etype = etype;
>> >> >> +		qp->term_info.ecode = ecode;
>> >> >> +		qp->term_info.in_tx = in_tx;
>> >> >> +		qp->term_info.valid = 1;
>> >> >> +	}
>> >> >> +	siw_dbg_qp(qp,
>> >> >> +		   "init TERM: layer %d, type %d, code %d, in tx %s\n",
>> >> >> +		   layer, etype, ecode, in_tx ? "yes" : "no");
>> >> >> +}
>> >> >> +
>> >> >> +/*
>> >> >> + * Send a TERMINATE message, as defined in RFCs 5040/5041/5044/6581.
>> >> >> + * Sending TERMINATE messages is best effort - such messages
>> >> >> + * can only be sent if the QP is still connected and it does
>> >> >> + * not have another outbound message in-progress, i.e. the
>> >> >> + * TERMINATE message must not interfere with an incomplete current
>> >> >> + * transmit operation.
>> >> >> + */
>> >> >> +void siw_send_terminate(struct siw_qp *qp)
>> >> >> +{
>> >> >> +	struct kvec		iov[3];
>> >> >> +	struct msghdr		msg = {.msg_flags = MSG_DONTWAIT|MSG_EOR};
>> >> >> +	struct iwarp_terminate	*term = NULL;
>> >> >> +	union iwarp_hdr		*err_hdr = NULL;
>> >> >> +	struct socket		*s = qp->attrs.sk;
>> >> >> +	struct siw_iwarp_rx	*rx_ctx = &qp->rx_ctx;
>> >> >> +	union iwarp_hdr		*rx_hdr = &rx_ctx->hdr;
>> >> >> +	u32 crc = 0;
>> >> >> +	int num_frags, len_terminate, rv;
>> >> >> +
>> >> >> +	if (!qp->term_info.valid)
>> >> >> +		return;
>> >> >> +
>> >> >> +	qp->term_info.valid = 0;
>> >> >> +
>> >> >> +	if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
>> >> >> +		siw_dbg_qp(qp, "cannot send TERMINATE: op %d in
>progress\n",
>> >> >> +			   tx_type(tx_wqe(qp)));
>> >> >> +		return;
>> >> >> +	}
>> >> >> +	if (!s && qp->cep)
>> >> >> +		/* QP not yet in RTS. Take socket from connection end point */
>> >> >> +		s = qp->cep->llp.sock;
>> >> >> +
>> >> >> +	if (!s) {
>> >> >> +		siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
>> >> >> +		return;
>> >> >> +	}
>> >> >> +
>> >> >> +	term = kzalloc(sizeof(*term), GFP_KERNEL);
>> >> >> +	if (!term)
>> >> >> +		return;
>> >> >> +
>> >> >> +	term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
>> >> >> +	term->ddp_mo = 0;
>> >> >> +	term->ddp_msn = cpu_to_be32(1);
>> >> >> +
>> >> >> +	iov[0].iov_base = term;
>> >> >> +	iov[0].iov_len = sizeof(*term);
>> >> >> +
>> >> >> +	if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
>> >> >> +	    ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
>> >> >> +	     (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
>> >> >> +		err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
>> >> >> +		if (!err_hdr) {
>> >> >> +			kfree(term);
>> >> >> +			return;
>> >> >> +		}
>> >> >> +	}
>> >> >> +	memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
>> >> >> +	       sizeof(struct iwarp_ctrl));
>> >> >> +
>> >> >> +	__rdmap_term_set_layer(term, qp->term_info.layer);
>> >> >> +	__rdmap_term_set_etype(term, qp->term_info.etype);
>> >> >> +	__rdmap_term_set_ecode(term, qp->term_info.ecode);
>> >> >> +
>> >> >> +	switch (qp->term_info.layer) {
>> >> >> +
>> >> >> +	case TERM_ERROR_LAYER_RDMAP:
>> >> >> +		if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
>> >> >> +			/* No additional DDP/RDMAP header to be included */
>> >> >> +			break;
>> >> >> +
>> >> >> +		if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION)
>{
>> >> >> +			/*
>> >> >> +			 * Complete RDMAP frame will get attached, and
>> >> >> +			 * DDP segment length is valid
>> >> >> +			 */
>> >> >> +			term->flag_m = 1;
>> >> >> +			term->flag_d = 1;
>> >> >> +			term->flag_r = 1;
>> >> >> +
>> >> >> +			if (qp->term_info.in_tx) {
>> >> >> +				struct iwarp_rdma_rreq *rreq;
>> >> >> +				struct siw_wqe *wqe = tx_wqe(qp);
>> >> >> +
>> >> >> +				/* Inbound RREQ error, detected during
>> >> >> +				 * RRESP creation. Take state from
>> >> >> +				 * current TX work queue element to
>> >> >> +				 * reconstruct peers RREQ.
>> >> >> +				 */
>> >> >> +				rreq = (struct iwarp_rdma_rreq *)err_hdr;
>> >> >> +
>> >> >> +				memcpy(&rreq->ctrl,
>> >> >> +				       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
>> >> >> +				       sizeof(struct iwarp_ctrl));
>> >> >> +
>> >> >> +				rreq->rsvd = 0;
>> >> >> +				rreq->ddp_qn =
>> >> >> +					htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
>> >> >> +
>> >> >> +				/* Provide RREQ's MSN as kept aside */
>> >> >> +				rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
>> >> >> +
>> >> >> +				rreq->ddp_mo = htonl(wqe->processed);
>> >> >> +				rreq->sink_stag = htonl(wqe->sqe.rkey);
>> >> >> +				rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
>> >> >> +				rreq->read_size = htonl(wqe->sqe.sge[0].length);
>> >> >> +				rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
>> >> >> +				rreq->source_to =
>> >> >> +					cpu_to_be64(wqe->sqe.sge[0].laddr);
>> >> >> +
>> >> >> +				iov[1].iov_base = rreq;
>> >> >> +				iov[1].iov_len = sizeof(*rreq);
>> >> >> +
>> >> >> +				rx_hdr = (union iwarp_hdr *)rreq;
>> >> >> +			} else {
>> >> >> +				/* Take RDMAP/DDP information from
>> >> >> +				 * current (failed) inbound frame.
>> >> >> +				 */
>> >> >> +				iov[1].iov_base = rx_hdr;
>> >> >> +
>> >> >> +				if (__rdmap_opcode(&rx_hdr->ctrl) ==
>> >> >> +				    RDMAP_RDMA_READ_REQ)
>> >> >> +					iov[1].iov_len =
>> >> >> +						sizeof(struct iwarp_rdma_rreq);
>> >> >> +				else /* SEND type */
>> >> >> +					iov[1].iov_len =
>> >> >> +						sizeof(struct iwarp_send);
>> >> >> +			}
>> >> >> +		} else {
>> >> >> +			/* Do not report DDP hdr information if packet
>> >> >> +			 * layout is unknown
>> >> >> +			 */
>> >> >> +			if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
>> >> >> +			    (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
>> >> >> +				break;
>> >> >> +
>> >> >> +			iov[1].iov_base = rx_hdr;
>> >> >> +
>> >> >> +			/* Only DDP frame will get attached */
>> >> >> +			if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
>> >> >> +				iov[1].iov_len =
>> >> >> +					sizeof(struct iwarp_rdma_write);
>> >> >> +			else
>> >> >> +				iov[1].iov_len = sizeof(struct iwarp_send);
>> >> >> +
>> >> >> +			term->flag_m = 1;
>> >> >> +			term->flag_d = 1;
>> >> >> +		}
>> >> >> +		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
>> >> >> +
>> >> >> +		break;
>> >> >> +
>> >> >> +	case TERM_ERROR_LAYER_DDP:
>> >> >> +		/* Report error encountered while DDP processing.
>> >> >> +		 * This can only happen as a result of inbound
>> >> >> +		 * DDP processing
>> >> >> +		 */
>> >> >> +
>> >> >> +		/* Do not report DDP hdr information if packet
>> >> >> +		 * layout is unknown
>> >> >> +		 */
>> >> >> +		if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
>> >> >> +		     (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
>> >> >> +		    ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
>> >> >> +		     (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
>> >> >> +			break;
>> >> >> +
>> >> >> +		iov[1].iov_base = rx_hdr;
>> >> >> +
>> >> >> +		if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
>> >> >> +			iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
>> >> >> +		else
>> >> >> +			iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
>> >> >> +
>> >> >> +		term->flag_m = 1;
>> >> >> +		term->flag_d = 1;
>> >> >> +
>> >> >> +		break;
>> >> >> +
>> >> >> +	default:
>> >> >> +		break;
>> >> >> +
>> >> >> +	}
>> >> >> +	if (term->flag_m || term->flag_d || term->flag_r) {
>> >> >> +		iov[2].iov_base = &crc;
>> >> >> +		iov[2].iov_len = sizeof(crc);
>> >> >> +		len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
>> >> >> +		num_frags = 3;
>> >> >> +	} else {
>> >> >> +		iov[1].iov_base = &crc;
>> >> >> +		iov[1].iov_len = sizeof(crc);
>> >> >> +		len_terminate = sizeof(*term) + MPA_CRC_SIZE;
>> >> >> +		num_frags = 2;
>> >> >> +	}
>> >> >> +
>> >> >> +	/* Adjust DDP Segment Length parameter, if valid */
>> >> >> +	if (term->flag_m) {
>> >> >> +		u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
>> >> >> +		enum rdma_opcode op = __rdmap_opcode(&rx_hdr->ctrl);
>> >> >> +
>> >> >> +		real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
>> >> >> +		rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
>> >> >> +	}
>> >> >> +
>> >> >> +	term->ctrl.mpa_len = cpu_to_be16(len_terminate -
>> >> >> +					 (MPA_HDR_SIZE + MPA_CRC_SIZE));
>> >> >> +	if (qp->tx_ctx.mpa_crc_hd) {
>> >> >> +		crypto_shash_init(rx_ctx->mpa_crc_hd);
>> >> >> +		if (siw_crc_array(rx_ctx->mpa_crc_hd, (u8 *)iov[0].iov_base,
>> >> >> +				  iov[0].iov_len))
>> >> >> +			goto out;
>> >> >> +
>> >> >> +		if (num_frags == 3) {
>> >> >> +			if (siw_crc_array(rx_ctx->mpa_crc_hd,
>> >> >> +					  (u8 *)iov[1].iov_base,
>> >> >> +					  iov[1].iov_len))
>> >> >> +				goto out;
>> >> >> +		}
>> >> >> +		crypto_shash_final(rx_ctx->mpa_crc_hd, (u8 *)&crc);
>> >> >> +	}
>> >> >> +
>> >> >> +	rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
>> >> >> +	siw_dbg_qp(qp,
>> >> >> +		   "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
>> >> >> +		   rv == len_terminate ? "success" : "failure",
>> >> >> +		   __rdmap_term_layer(term), __rdmap_term_etype(term),
>> >> >> +		   __rdmap_term_ecode(term), rv);
>> >> >> +out:
>> >> >> +	kfree(term);
>> >> >> +	kfree(err_hdr);
>> >> >> +}
>> >> >> +
>> >> >> +/*
>> >> >> + * handle all attrs other than state
>> >> >> + */
>> >> >> +static void siw_qp_modify_nonstate(struct siw_qp *qp,
>> >> >> +				   struct siw_qp_attrs *attrs,
>> >> >> +				   enum siw_qp_attr_mask mask)
>> >> >> +{
>> >> >> +	if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
>> >> >> +		if (attrs->flags & SIW_RDMA_BIND_ENABLED)
>> >> >> +			qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
>> >> >> +		else
>> >> >> +			qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
>> >> >> +
>> >> >> +		if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
>> >> >> +			qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
>> >> >> +		else
>> >> >> +			qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
>> >> >> +
>> >> >> +		if (attrs->flags & SIW_RDMA_READ_ENABLED)
>> >> >> +			qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
>> >> >> +		else
>> >> >> +			qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
>> >> >> +	}
>> >> >> +}
>> >> >> +
>> >> >> +/*
>> >> >> + * caller holds qp->state_lock
>> >> >> + */
>> >> >> +int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
>> >> >> +		  enum siw_qp_attr_mask mask)
>> >> >> +{
>> >> >> +	int	drop_conn = 0, rv = 0;
>> >> >> +
>> >> >> +	if (!mask)
>> >> >> +		return 0;
>> >> >> +
>> >> >> +	siw_dbg_qp(qp, "state: %s => %s\n",
>> >> >> +		   siw_qp_state_to_string[qp->attrs.state],
>> >> >> +		   siw_qp_state_to_string[attrs->state]);
>> >> >> +
>> >> >> +	if (mask != SIW_QP_ATTR_STATE)
>> >> >> +		siw_qp_modify_nonstate(qp, attrs, mask);
>> >> >> +
>> >> >> +	if (!(mask & SIW_QP_ATTR_STATE))
>> >> >> +		return 0;
>> >> >> +
>> >> >> +	switch (qp->attrs.state) {
>> >> >> +
>> >> >> +	case SIW_QP_STATE_IDLE:
>> >> >> +	case SIW_QP_STATE_RTR:
>> >> >> +
>> >> >> +		switch (attrs->state) {
>> >> >> +
>> >> >> +		case SIW_QP_STATE_RTS:
>> >> >> +
>> >> >> +			if (attrs->flags & SIW_MPA_CRC) {
>> >> >> +				rv = siw_qp_enable_crc(qp);
>> >> >> +				if (rv)
>> >> >> +					break;
>> >> >> +			}
>> >> >> +			if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
>> >> >> +				siw_dbg_qp(qp, "no socket\n");
>> >> >> +				rv = -EINVAL;
>> >> >> +				break;
>> >> >> +			}
>> >> >> +			if (!(mask & SIW_QP_ATTR_MPA)) {
>> >> >> +				siw_dbg_qp(qp, "no MPA\n");
>> >> >> +				rv = -EINVAL;
>> >> >> +				break;
>> >> >> +			}
>> >> >> +			siw_dbg_qp(qp, "enter rts, peer 0x%08x, loc 0x%08x\n",
>> >> >> +				   qp->cep->llp.raddr.sin_addr.s_addr,
>> >> >> +				   qp->cep->llp.laddr.sin_addr.s_addr);
>> >> >> +			/*
>> >> >> +			 * Initialize iWARP TX state
>> >> >> +			 */
>> >> >> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
>> >> >> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
>> >> >> +			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
>> >> >> +
>> >> >> +			/*
>> >> >> +			 * Initialize iWARP RX state
>> >> >> +			 */
>> >> >> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
>> >> >> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
>> >> >> +			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
>> >> >> +
>> >> >> +			/*
>> >> >> +			 * init IRD free queue, caller has already checked
>> >> >> +			 * limits.
>> >> >> +			 */
>> >> >> +			rv = siw_qp_readq_init(qp, attrs->irq_size,
>> >> >> +					       attrs->orq_size);
>> >> >> +			if (rv)
>> >> >> +				break;
>> >> >> +
>> >> >> +			qp->attrs.sk = attrs->sk;
>> >> >> +			qp->attrs.state = SIW_QP_STATE_RTS;
>> >> >> +
>> >> >> +			break;
>> >> >> +
>> >> >> +		case SIW_QP_STATE_ERROR:
>> >> >> +			siw_rq_flush(qp);
>> >> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
>> >> >> +			if (qp->cep) {
>> >> >> +				siw_cep_put(qp->cep);
>> >> >> +				qp->cep = NULL;
>> >> >> +			}
>> >> >> +			break;
>> >> >> +
>> >> >> +		case SIW_QP_STATE_RTR:
>> >> >> +			/* ignore */
>> >> >> +			break;
>> >> >> +
>> >> >> +		default:
>> >> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
>> >> >> +				   siw_qp_state_to_string[qp->attrs.state],
>> >> >> +				   siw_qp_state_to_string[attrs->state]);
>> >> >> +			break;
>> >> >> +		}
>> >> >> +		break;
>> >> >> +
>> >> >> +	case SIW_QP_STATE_RTS:
>> >> >> +
>> >> >> +		switch (attrs->state) {
>> >> >> +
>> >> >> +		case SIW_QP_STATE_CLOSING:
>> >> >> +			/*
>> >> >> +			 * Verbs: move to IDLE if SQ and ORQ are empty.
>> >> >> +			 * Move to ERROR otherwise. But first of all we must
>> >> >> +			 * close the connection. So we keep CLOSING or ERROR
>> >> >> +			 * as a transient state, schedule connection drop work
>> >> >> +			 * and wait for the socket state change upcall to
>> >> >> +			 * come back closed.
>> >> >> +			 */
>> >> >> +			if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
>> >> >> +				qp->attrs.state = SIW_QP_STATE_CLOSING;
>> >> >> +			} else {
>> >> >> +				qp->attrs.state = SIW_QP_STATE_ERROR;
>> >> >> +				siw_sq_flush(qp);
>> >> >> +			}
>> >> >> +			siw_rq_flush(qp);
>> >> >> +
>> >> >> +			drop_conn = 1;
>> >> >> +			break;
>> >> >> +
>> >> >> +		case SIW_QP_STATE_TERMINATE:
>> >> >> +			qp->attrs.state = SIW_QP_STATE_TERMINATE;
>> >> >> +
>> >> >> +			siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
>> >> >> +					   RDMAP_ETYPE_CATASTROPHIC,
>> >> >> +					   RDMAP_ECODE_UNSPECIFIED, 1);
>> >> >> +			drop_conn = 1;
>> >> >> +
>> >> >> +			break;
>> >> >> +
>> >> >> +		case SIW_QP_STATE_ERROR:
>> >> >> +			/*
>> >> >> +			 * This is an emergency close.
>> >> >> +			 *
>> >> >> +			 * Any in progress transmit operation will get
>> >> >> +			 * cancelled.
>> >> >> +			 * This will likely result in a protocol failure,
>> >> >> +			 * if a TX operation is in transit. The caller
>> >> >> +			 * could unconditionally wait to give the current
>> >> >> +			 * operation a chance to complete.
>> >> >> +			 * Esp., how to handle the non-empty IRQ case?
>> >> >> +			 * The peer was asking for data transfer at a valid
>> >> >> +			 * point in time.
>> >> >> +			 */
>> >> >> +			siw_sq_flush(qp);
>> >> >> +			siw_rq_flush(qp);
>> >> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
>> >> >> +			drop_conn = 1;
>> >> >> +
>> >> >> +			break;
>> >> >> +
>> >> >> +		default:
>> >> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
>> >> >> +				   siw_qp_state_to_string[qp->attrs.state],
>> >> >> +				   siw_qp_state_to_string[attrs->state]);
>> >> >> +			break;
>> >> >> +		}
>> >> >> +		break;
>> >> >> +
>> >> >> +	case SIW_QP_STATE_TERMINATE:
>> >> >> +
>> >> >> +		switch (attrs->state) {
>> >> >> +
>> >> >> +		case SIW_QP_STATE_ERROR:
>> >> >> +			siw_rq_flush(qp);
>> >> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
>> >> >> +
>> >> >> +			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
>> >> >> +				siw_sq_flush(qp);
>> >> >> +
>> >> >> +			break;
>> >> >> +
>> >> >> +		default:
>> >> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
>> >> >> +				   siw_qp_state_to_string[qp->attrs.state],
>> >> >> +				   siw_qp_state_to_string[attrs->state]);
>> >> >> +		}
>> >> >> +		break;
>> >> >> +
>> >> >> +	case SIW_QP_STATE_CLOSING:
>> >> >> +
>> >> >> +		switch (attrs->state) {
>> >> >> +
>> >> >> +		case SIW_QP_STATE_IDLE:
>> >> >> +			WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
>> >> >> +			qp->attrs.state = SIW_QP_STATE_IDLE;
>> >> >> +
>> >> >> +			break;
>> >> >> +
>> >> >> +		case SIW_QP_STATE_CLOSING:
>> >> >> +			/*
>> >> >> +			 * The LLP may have already moved the QP to closing
>> >> >> +			 * due to graceful peer close init
>> >> >> +			 */
>> >> >> +			break;
>> >> >> +
>> >> >> +		case SIW_QP_STATE_ERROR:
>> >> >> +			/*
>> >> >> +			 * QP was moved to CLOSING by LLP event
>> >> >> +			 * not yet seen by user.
>> >> >> +			 */
>> >> >> +			qp->attrs.state = SIW_QP_STATE_ERROR;
>> >> >> +
>> >> >> +			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
>> >> >> +				siw_sq_flush(qp);
>> >> >> +
>> >> >> +			siw_rq_flush(qp);
>> >> >> +
>> >> >> +			break;
>> >> >> +
>> >> >> +		default:
>> >> >> +			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
>> >> >> +				   siw_qp_state_to_string[qp->attrs.state],
>> >> >> +				   siw_qp_state_to_string[attrs->state]);
>> >> >> +
>> >> >> +			return -ECONNABORTED;
>> >> >> +		}
>> >> >> +		break;
>> >> >> +
>> >> >> +	default:
>> >> >> +		siw_dbg_qp(qp, " noop: state %s\n",
>> >> >> +			   siw_qp_state_to_string[qp->attrs.state]);
>> >> >> +		break;
>> >> >> +	}
>> >> >> +	if (drop_conn)
>> >> >> +		siw_qp_cm_drop(qp, 0);
>> >> >> +
>> >> >> +	return rv;
>> >> >> +}
>> >> >> +
>> >> >> +struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
>> >> >> +{
>> >> >> +	struct siw_qp *qp =  siw_qp_id2obj(to_siw_dev(base_dev), id);
>> >> >> +
>> >> >> +	if (qp) {
>> >> >> +		/*
>> >> >> +		 * siw_qp_id2obj() increments object reference count
>> >> >> +		 */
>> >> >> +		siw_qp_put(qp);
>> >> >> +		siw_dbg_qp(qp, "got base QP");
>> >> >> +
>> >> >> +		return &qp->base_qp;
>> >> >> +	}
>> >> >> +	return (struct ib_qp *)NULL;
>> >> >> +}
>> >> >> +
>> >> >> +/*
>> >> >> + * siw_check_mem()
>> >> >> + *
>> >> >> + * Check protection domain, STAG state, access permissions and
>> >> >> + * address range for memory object.
>> >> >> + *
>> >> >> + * @pd:		Protection Domain memory should belong to
>> >> >> + * @mem:	memory to be checked
>> >> >> + * @addr:	starting addr of mem
>> >> >> + * @perms:	requested access permissions
>> >> >> + * @len:	len of memory interval to be checked
>> >> >> + *
>> >> >> + */
>> >> >> +int siw_check_mem(struct siw_pd *pd, struct siw_mem *mem, u64 addr,
>> >> >> +		  enum siw_access_flags perms, int len)
>> >> >> +{
>> >> >> +	if (siw_mem2mr(mem)->pd != pd) {
>> >> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: pd mismatch\n", OBJ_ID(pd));
>> >> >> +		return -E_PD_MISMATCH;
>> >> >> +	}
>> >> >> +	if (!mem->stag_valid) {
>> >> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: stag 0x%08x invalid\n",
>> >> >> +			OBJ_ID(pd), OBJ_ID(mem));
>> >> >> +		return -E_STAG_INVALID;
>> >> >> +	}
>> >> >> +	/*
>> >> >> +	 * check access permissions
>> >> >> +	 */
>> >> >> +	if ((mem->perms & perms) < perms) {
>> >> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: permissions 0x%08x < 0x%08x\n",
>> >> >> +			OBJ_ID(pd), mem->perms, perms);
>> >> >> +		return -E_ACCESS_PERM;
>> >> >> +	}
>> >> >> +	/*
>> >> >> +	 * Check address interval: we relax the check to allow memory shrunk
>> >> >> +	 * from the start address _after_ placing or fetching len bytes.
>> >> >> +	 * TODO: this relaxation is probably overdone
>> >> >> +	 */
>> >> >> +	if (addr < mem->va || addr + len > mem->va + mem->len) {
>> >> >> +		siw_dbg(pd->hdr.sdev, "[PD %d]: MEM interval len %d\n",
>> >> >> +			OBJ_ID(pd), len);
>> >> >> +		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx) out of bounds\n",
>> >> >> +			(unsigned long long)addr,
>> >> >> +			(unsigned long long)(addr + len));
>> >> >> +		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx] LKey=0x%08x\n",
>> >> >> +			(unsigned long long)mem->va,
>> >> >> +			(unsigned long long)(mem->va + mem->len),
>> >> >> +			OBJ_ID(mem));
>> >> >> +
>> >> >> +		return -E_BASE_BOUNDS;
>> >> >> +	}
>> >> >> +	return E_ACCESS_OK;
>> >> >> +}
>> >> >> +
>> >> >> +/*
>> >> >> + * siw_check_sge()
>> >> >> + *
>> >> >> + * Check SGE for access rights in given interval
>> >> >> + *
>> >> >> + * @pd:		Protection Domain memory should belong to
>> >> >> + * @sge:	SGE to be checked
>> >> >> + * @mem:	array of memory references
>> >> >> + * @perms:	requested access permissions
>> >> >> + * @off:	starting offset in SGE
>> >> >> + * @len:	len of memory interval to be checked
>> >> >> + *
>> >> >> + * NOTE: Function references SGE's memory object (mem->obj)
>> >> >> + * if not yet done. New reference is kept if check went ok and
>> >> >> + * released if check failed. If mem->obj is already valid, no new
>> >> >> + * lookup is being done and mem is not released if check fails.
>> >> >> + */
>> >> >> +int
>> >> >> +siw_check_sge(struct siw_pd *pd, struct siw_sge *sge,
>> >> >> +	      struct siw_mem *mem[], enum siw_access_flags perms,
>> >> >> +	      u32 off, int len)
>> >> >> +{
>> >> >> +	struct siw_device *sdev = pd->hdr.sdev;
>> >> >> +	int new_ref = 0, rv = E_ACCESS_OK;
>> >> >> +
>> >> >> +	if (len + off > sge->length) {
>> >> >> +		rv = -E_BASE_BOUNDS;
>> >> >> +		goto fail;
>> >> >> +	}
>> >> >> +	if (*mem == NULL) {
>> >> >> +		*mem = siw_mem_id2obj(sdev, sge->lkey >> 8);
>> >> >> +		if (*mem == NULL) {
>> >> >> +			rv = -E_STAG_INVALID;
>> >> >> +			goto fail;
>> >> >> +		}
>> >> >> +		new_ref = 1;
>> >> >> +	}
>> >> >> +
>> >> >> +	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
>> >> >> +	if (rv)
>> >> >> +		goto fail;
>> >> >> +
>> >> >> +	return 0;
>> >> >> +
>> >> >> +fail:
>> >> >> +	if (new_ref) {
>> >> >> +		siw_mem_put(*mem);
>> >> >> +		*mem = NULL;
>> >> >> +	}
>> >> >> +	return rv;
>> >> >> +}
>> >> >> +
>> >> >> +void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
>> >> >> +{
>> >> >> +	rreq->id = sqe->id;
>> >> >> +	rreq->opcode = sqe->opcode;
>> >> >> +	rreq->sge[0].laddr = sqe->sge[0].laddr;
>> >> >> +	rreq->sge[0].length = sqe->sge[0].length;
>> >> >> +	rreq->sge[0].lkey = sqe->sge[0].lkey;
>> >> >> +	rreq->sge[1].lkey = sqe->sge[1].lkey;
>> >> >> +	rreq->flags = sqe->flags | SIW_WQE_VALID;
>> >> >> +	rreq->num_sge = 1;
>> >> >> +}
>> >> >> +
>> >> >> +/*
>> >> >> + * Must be called with SQ locked.
>> >> >> + * To avoid complete SQ starvation by constant inbound READ requests,
>> >> >> + * the active IRQ will not be served after qp->irq_burst, if the
>> >> >> + * SQ has pending work.
>> >> >> + */
>> >> >> +int siw_activate_tx(struct siw_qp *qp)
>> >> >> +{
>> >> >> +	struct siw_sqe	*irqe, *sqe;
>> >> >> +	struct siw_wqe	*wqe = tx_wqe(qp);
>> >> >> +	int rv = 1;
>> >> >> +
>> >> >> +	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
>> >> >> +
>> >> >> +	if (irqe->flags & SIW_WQE_VALID) {
>> >> >> +		sqe = sq_get_next(qp);
>> >> >> +
>> >> >> +		/*
>> >> >> +		 * Avoid local WQE processing starvation in case
>> >> >> +		 * of constant inbound READ request stream
>> >> >> +		 */
>> >> >> +		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
>> >> >> +			qp->irq_burst = 0;
>> >> >> +			goto skip_irq;
>> >> >> +		}
>> >> >> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
>> >> >> +		wqe->wr_status = SIW_WR_QUEUED;
>> >> >> +
>> >> >> +		/* start READ RESPONSE */
>> >> >> +		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
>> >> >> +		wqe->sqe.flags = 0;
>> >> >> +		if (irqe->num_sge) {
>> >> >> +			wqe->sqe.num_sge = 1;
>> >> >> +			wqe->sqe.sge[0].length = irqe->sge[0].length;
>> >> >> +			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
>> >> >> +			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
>> >> >> +		} else {
>> >> >> +			wqe->sqe.num_sge = 0;
>> >> >> +		}
>> >> >> +
>> >> >> +		/* Retain original RREQ's message sequence number for
>> >> >> +		 * potential error reporting cases.
>> >> >> +		 */
>> >> >> +		wqe->sqe.sge[1].length = irqe->sge[1].length;
>> >> >> +
>> >> >> +		wqe->sqe.rkey = irqe->rkey;
>> >> >> +		wqe->sqe.raddr = irqe->raddr;
>> >> >> +
>> >> >> +		wqe->processed = 0;
>> >> >> +		qp->irq_get++;
>> >> >> +
>> >> >> +		/* mark current IRQ entry free */
>> >> >> +		smp_store_mb(irqe->flags, 0);
>> >> >> +
>> >> >> +		goto out;
>> >> >> +	}
>> >> >> +
>> >> >> +	sqe = sq_get_next(qp);
>> >> >> +	if (sqe) {
>> >> >> +skip_irq:
>> >> >> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
>> >> >> +		wqe->wr_status = SIW_WR_QUEUED;
>> >> >> +
>> >> >> +		/* First copy SQE to kernel private memory */
>> >> >> +		memcpy(&wqe->sqe, sqe, sizeof(*sqe));
>> >> >> +
>> >> >> +		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
>> >> >> +			rv = -EINVAL;
>> >> >> +			goto out;
>> >> >> +		}
>> >> >> +		if (wqe->sqe.flags & SIW_WQE_INLINE) {
>> >> >> +			if (wqe->sqe.opcode != SIW_OP_SEND &&
>> >> >> +			    wqe->sqe.opcode != SIW_OP_WRITE) {
>> >> >> +				rv = -EINVAL;
>> >> >> +				goto out;
>> >> >> +			}
>> >> >> +			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
>> >> >> +				rv = -EINVAL;
>> >> >> +				goto out;
>> >> >> +			}
>> >> >> +			wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1];
>> >> >> +			wqe->sqe.sge[0].lkey = 0;
>> >> >> +			wqe->sqe.num_sge = 1;
>> >> >> +		}
>> >> >> +		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
>> >> >> +			/* A READ cannot be fenced */
>> >> >> +			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
>> >> >> +			    wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV)) {
>> >> >> +				siw_dbg_qp(qp, "cannot fence read\n");
>> >> >> +				rv = -EINVAL;
>> >> >> +				goto out;
>> >> >> +			}
>> >> >> +			spin_lock(&qp->orq_lock);
>> >> >> +
>> >> >> +			if (!siw_orq_empty(qp)) {
>> >> >> +				qp->tx_ctx.orq_fence = 1;
>> >> >> +				rv = 0;
>> >> >> +			}
>> >> >> +			spin_unlock(&qp->orq_lock);
>> >> >> +
>> >> >> +		} else if (wqe->sqe.opcode == SIW_OP_READ ||
>> >> >> +			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
>> >> >> +			struct siw_sqe	*rreq;
>> >> >> +
>> >> >> +			wqe->sqe.num_sge = 1;
>> >> >> +
>> >> >> +			spin_lock(&qp->orq_lock);
>> >> >> +
>> >> >> +			rreq = orq_get_free(qp);
>> >> >> +			if (rreq) {
>> >> >> +				/*
>> >> >> +				 * Make an immediate copy in ORQ to be ready
>> >> >> +				 * to process loopback READ reply
>> >> >> +				 */
>> >> >> +				siw_read_to_orq(rreq, &wqe->sqe);
>> >> >> +				qp->orq_put++;
>> >> >> +			} else {
>> >> >> +				qp->tx_ctx.orq_fence = 1;
>> >> >> +				rv = 0;
>> >> >> +			}
>> >> >> +			spin_unlock(&qp->orq_lock);
>> >> >> +		}
>> >> >> +
>> >> >> +		/* Clear SQE, can be re-used by application */
>> >> >> +		smp_store_mb(sqe->flags, 0);
>> >> >> +		qp->sq_get++;
>> >> >> +	} else {
>> >> >> +		rv = 0;
>> >> >> +	}
>> >> >> +out:
>> >> >> +	if (unlikely(rv < 0)) {
>> >> >> +		siw_dbg_qp(qp, "error %d\n", rv);
>> >> >> +		wqe->wr_status = SIW_WR_IDLE;
>> >> >> +	}
>> >> >> +	return rv;
>> >> >> +}
>> >> >> +
>> >> >> +/*
>> >> >> + * Check if current CQ state qualifies for
>> >> >> + * calling CQ completion handler. Must be
>> >> >> + * called with CQ lock held.
>> >> >> + */
>> >> >> +static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
>> >> >> +{
>> >> >> +	u64 cq_notify;
>> >> >> +
>> >> >> +	if (!cq->base_cq.comp_handler)
>> >> >> +		return false;
>> >> >> +
>> >> >> +	cq_notify = READ_ONCE(*cq->notify);
>> >> >> +
>> >> >> +	if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
>> >> >> +	    ((cq_notify & SIW_NOTIFY_SOLICITED) &&
>> >> >> +	     (flags & SIW_WQE_SOLICITED))) {
>> >> >> +		/* dis-arm CQ */
>> >> >> +		smp_store_mb(*cq->notify, SIW_NOTIFY_NOT);
>> >> >> +
>> >> >> +		return true;
>> >> >> +	}
>> >> >> +	return false;
>> >> >> +}
>> >> >> +
>> >> >> +/* Must be called without holding CQ lock */
>> >> >> +static inline void siw_cq_completion(struct siw_cq *cq)
>> >> >> +{
>> >> >> +	siw_dbg_obj(cq, "Completion\n");
>> >> >> +	(*cq->base_cq.comp_handler)(&cq->base_cq, cq->base_cq.cq_context);
>> >> >> +}
>> >> >> +
>> >> >> +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
>> >> >> +		     enum siw_wc_status status)
>> >> >> +{
>> >> >> +	struct siw_cq *cq = qp->scq;
>> >> >> +	int rv = 0;
>> >> >> +
>> >> >> +	if (cq) {
>> >> >> +		u32 sqe_flags = sqe->flags;
>> >> >> +		struct siw_cqe *cqe;
>> >> >> +		u32 idx;
>> >> >> +		unsigned long flags;
>> >> >> +
>> >> >> +		spin_lock_irqsave(&cq->lock, flags);
>> >> >> +
>> >> >> +		idx = cq->cq_put % cq->num_cqe;
>> >> >> +		cqe = &cq->queue[idx];
>> >> >> +
>> >> >> +		if (!READ_ONCE(cqe->flags)) {
>> >> >> +			bool notify;
>> >> >> +
>> >> >> +			cqe->id = sqe->id;
>> >> >> +			cqe->opcode = sqe->opcode;
>> >> >> +			cqe->status = status;
>> >> >> +			cqe->imm_data = 0;
>> >> >> +			cqe->bytes = bytes;
>> >> >> +
>> >> >> +			if (cq->kernel_verbs) {
>> >> >
>> >> >kernel_verbs is managed by Ib/core, why should driver know about it?
>> >> >
>> >> User land CQE's carry the corresponding QP ID, kernel clients expect a QP
>> >> pointer here. That's where the difference comes from. This distinction
>> >> between kernel and user clients is needed in more places, e.g. since the
>> >> user land's CQE array is memory mapped, where the kernel land's is not.
>> >
>> >It is passed through udata, your code should check existence of "udata"
>> >and not manage a user/kernel flag.
>> >
>> >
>>
>> Right. I simply wanted to avoid those potentially cache thrashing
>> efforts - the udata pointer of the ib_device is quite far away
>> from the CQ array. So we kind of trade redundancy for performance,
>> and CQE creation is definitively on the fast path...
>
>Do you have any performance numbers to support the need for this
>user/kernel flag?
>Also rdma_is_kernel_res() can give you a very fast answer if you have a user
>or kernel object.
>
No, sorry, I don't have performance numbers for that claim.
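
For reference, a minimal sketch of the alternative discussed above, keying the
decision off the CQ's restrack entry via rdma_is_kernel_res() instead of a
driver-private kernel_verbs flag. The CQE member names (base_qp, qp_id) and
the QP_ID() helper are illustrative only and not taken from the posted series:

	/* inside siw_sqe_complete(), once the CQE slot has been claimed */
	if (rdma_is_kernel_res(&cq->base_cq.res)) {
		/* kernel consumer: hand back a QP pointer */
		cqe->base_qp = &qp->base_qp;
	} else {
		/* user consumer: the mmapped CQE can only carry an ID */
		cqe->qp_id = QP_ID(qp);
	}

Whether the extra dereference shows up on the CQE fast path is exactly the
open question above, so this stays a sketch rather than a drop-in replacement.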
Jason Gunthorpe March 6, 2019, 7:51 p.m. UTC | #7
On Tue, Feb 19, 2019 at 11:08:58AM +0100, Bernard Metzler wrote:
> +/*
> + * Must be called with SQ locked.
> + * To avoid complete SQ starvation by constant inbound READ requests,
> + * the active IRQ will not be served after qp->irq_burst, if the
> + * SQ has pending work.
> + */
> +int siw_activate_tx(struct siw_qp *qp)
> +{
> +	struct siw_sqe	*irqe, *sqe;
> +	struct siw_wqe	*wqe = tx_wqe(qp);
> +	int rv = 1;
> +
> +	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
> +
> +	if (irqe->flags & SIW_WQE_VALID) {
> +		sqe = sq_get_next(qp);
> +
> +		/*
> +		 * Avoid local WQE processing starvation in case
> +		 * of constant inbound READ request stream
> +		 */
> +		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
> +			qp->irq_burst = 0;
> +			goto skip_irq;
> +		}
> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
> +		wqe->wr_status = SIW_WR_QUEUED;
> +
> +		/* start READ RESPONSE */
> +		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
> +		wqe->sqe.flags = 0;
> +		if (irqe->num_sge) {
> +			wqe->sqe.num_sge = 1;
> +			wqe->sqe.sge[0].length = irqe->sge[0].length;
> +			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
> +			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
> +		} else {
> +			wqe->sqe.num_sge = 0;
> +		}
> +
> +		/* Retain original RREQ's message sequence number for
> +		 * potential error reporting cases.
> +		 */
> +		wqe->sqe.sge[1].length = irqe->sge[1].length;
> +
> +		wqe->sqe.rkey = irqe->rkey;
> +		wqe->sqe.raddr = irqe->raddr;
> +
> +		wqe->processed = 0;
> +		qp->irq_get++;
> +
> +		/* mark current IRQ entry free */
> +		smp_store_mb(irqe->flags, 0);

I really dislike seeing attempts at lock-free constructions like this
in drivers... 

.. and I'm having doubts it is right, as directly above we read
irqe->flags without a barrier/atomic/etc in the if.

What is this attempting to do, and why can't it have a standard
scheme?

'lock-free' with test_bit/set_bit/clear_bit atomics would be better
than this..

Same remark for all these magic cases.

Jason
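
For illustration, a minimal sketch of the bit-atomic scheme suggested above.
It assumes the IRQ entry's flags word becomes an unsigned long (the posted
siw_sqe uses a plain integer flag) and invents SIW_SQE_VALID_BIT; it is not
part of the series:

	#define SIW_SQE_VALID_BIT	0

	/* RX path: fill the IRQ entry, then publish it */
	irqe->rkey  = rkey;
	irqe->raddr = raddr;
	smp_mb__before_atomic();
	set_bit(SIW_SQE_VALID_BIT, &irqe->flags);

	/* SQ processing, under the SQ spinlock: consume and free the slot */
	if (test_bit(SIW_SQE_VALID_BIT, &irqe->flags)) {
		/* ... copy the fields into the private tx WQE ... */
		clear_bit_unlock(SIW_SQE_VALID_BIT, &irqe->flags);
		qp->irq_get++;
	}

clear_bit_unlock() provides the release ordering that the open-coded
smp_store_mb() is meant to give, while keeping the valid-flag handling in one
recognizable idiom.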
Bernard Metzler March 7, 2019, 4:14 p.m. UTC | #8
-----"Jason Gunthorpe" <jgg@ziepe.ca> wrote: -----

>To: "Bernard Metzler" <bmt@zurich.ibm.com>
>From: "Jason Gunthorpe" <jgg@ziepe.ca>
>Date: 03/06/2019 08:51PM
>Cc: linux-rdma@vger.kernel.org
>Subject: Re: [PATCH v5 08/13] SIW queue pair methods
>
>On Tue, Feb 19, 2019 at 11:08:58AM +0100, Bernard Metzler wrote:
>> +/*
>> + * Must be called with SQ locked.
>> + * To avoid complete SQ starvation by constant inbound READ requests,
>> + * the active IRQ will not be served after qp->irq_burst, if the
>> + * SQ has pending work.
>> + */
>> +int siw_activate_tx(struct siw_qp *qp)
>> +{
>> +	struct siw_sqe	*irqe, *sqe;
>> +	struct siw_wqe	*wqe = tx_wqe(qp);
>> +	int rv = 1;
>> +
>> +	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
>> +
>> +	if (irqe->flags & SIW_WQE_VALID) {
>> +		sqe = sq_get_next(qp);
>> +
>> +		/*
>> +		 * Avoid local WQE processing starvation in case
>> +		 * of constant inbound READ request stream
>> +		 */
>> +		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
>> +			qp->irq_burst = 0;
>> +			goto skip_irq;
>> +		}
>> +		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
>> +		wqe->wr_status = SIW_WR_QUEUED;
>> +
>> +		/* start READ RESPONSE */
>> +		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
>> +		wqe->sqe.flags = 0;
>> +		if (irqe->num_sge) {
>> +			wqe->sqe.num_sge = 1;
>> +			wqe->sqe.sge[0].length = irqe->sge[0].length;
>> +			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
>> +			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
>> +		} else {
>> +			wqe->sqe.num_sge = 0;
>> +		}
>> +
>> +		/* Retain original RREQ's message sequence number for
>> +		 * potential error reporting cases.
>> +		 */
>> +		wqe->sqe.sge[1].length = irqe->sge[1].length;
>> +
>> +		wqe->sqe.rkey = irqe->rkey;
>> +		wqe->sqe.raddr = irqe->raddr;
>> +
>> +		wqe->processed = 0;
>> +		qp->irq_get++;
>> +
>> +		/* mark current IRQ entry free */
>> +		smp_store_mb(irqe->flags, 0);
>
>I really dislike seeing attempts at lock-free constructions like this
>in drivers... 
>
In fact I do not need memory barriers here, since the memory is
not mapped. It is the inbound read queue, and both receive
processing and send processing access it under the
send queue spinlock only.

The receive side adds to the queue when a new read request
comes in; send queue processing serves the IRQ as well
as the SQ.
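
A rough sketch of the locking described here, as far as it can be read from
this series (field names other than qp->sq_lock and SIW_WQE_VALID are
simplified):

	/* RX path queuing an inbound READ request */
	spin_lock_irqsave(&qp->sq_lock, flags);
	irqe = &qp->irq[qp->irq_put % qp->attrs.irq_size];
	if (!(irqe->flags & SIW_WQE_VALID)) {
		/* fill rkey, raddr, sge ... */
		irqe->flags = SIW_WQE_VALID;
		qp->irq_put++;
	}
	spin_unlock_irqrestore(&qp->sq_lock, flags);

	/* TX path: siw_activate_tx() runs with the same lock held, so plain
	 * loads and stores on irqe->flags are already serialized by it. */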


>.. and I'm having doubts it is right, as directly above we read
>irqe->flags without a barrier/atomic/etc in the if.
>
right

>What is this attempting to do, and why can't it have a standard
>scheme?
>
>'lock-free' with test_bit/set_bit/clear_bit atomics would be better
>than this..
>
>Same remark for all these magic cases.
>
>Jason
>
>
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c
new file mode 100644
index 000000000000..75fd151dae39
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_qp.c
@@ -0,0 +1,1478 @@ 
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Software iWARP device driver
+ *
+ * Authors: Bernard Metzler <bmt@zurich.ibm.com>
+ *
+ * Copyright (c) 2008-2018, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/file.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <asm/barrier.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
+	[SIW_QP_STATE_IDLE]		= "IDLE",
+	[SIW_QP_STATE_RTR]		= "RTR",
+	[SIW_QP_STATE_RTS]		= "RTS",
+	[SIW_QP_STATE_CLOSING]		= "CLOSING",
+	[SIW_QP_STATE_TERMINATE]	= "TERMINATE",
+	[SIW_QP_STATE_ERROR]		= "ERROR"
+};
+
+/*
+ * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
+ * per-RDMAP message basis. Please keep the order of initializers. All MPA len
+ * fields are initialized to the minimum packet size.
+ */
+struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = { {
+	/* RDMAP_RDMA_WRITE */
+	.hdr_len = sizeof(struct iwarp_rdma_write),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
+	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST
+		| cpu_to_be16(DDP_VERSION << 8)
+		| cpu_to_be16(RDMAP_VERSION << 6)
+		| cpu_to_be16(RDMAP_RDMA_WRITE),
+	.proc_data = siw_proc_write
+},
+{	/* RDMAP_RDMA_READ_REQ */
+	.hdr_len = sizeof(struct iwarp_rdma_rreq),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
+	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
+		| cpu_to_be16(DDP_VERSION << 8)
+		| cpu_to_be16(RDMAP_VERSION << 6)
+		| cpu_to_be16(RDMAP_RDMA_READ_REQ),
+	.proc_data = siw_proc_rreq
+},
+{	/* RDMAP_RDMA_READ_RESP */
+	.hdr_len = sizeof(struct iwarp_rdma_rresp),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
+	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST
+		| cpu_to_be16(DDP_VERSION << 8)
+		| cpu_to_be16(RDMAP_VERSION << 6)
+		| cpu_to_be16(RDMAP_RDMA_READ_RESP),
+	.proc_data = siw_proc_rresp
+},
+{	/* RDMAP_SEND */
+	.hdr_len = sizeof(struct iwarp_send),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
+		| cpu_to_be16(DDP_VERSION << 8)
+		| cpu_to_be16(RDMAP_VERSION << 6)
+		| cpu_to_be16(RDMAP_SEND),
+	.proc_data = siw_proc_send
+},
+{	/* RDMAP_SEND_INVAL */
+	.hdr_len = sizeof(struct iwarp_send_inv),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
+		| cpu_to_be16(DDP_VERSION << 8)
+		| cpu_to_be16(RDMAP_VERSION << 6)
+		| cpu_to_be16(RDMAP_SEND_INVAL),
+	.proc_data = siw_proc_send
+},
+{	/* RDMAP_SEND_SE */
+	.hdr_len = sizeof(struct iwarp_send),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
+		| cpu_to_be16(DDP_VERSION << 8)
+		| cpu_to_be16(RDMAP_VERSION << 6)
+		| cpu_to_be16(RDMAP_SEND_SE),
+	.proc_data = siw_proc_send
+},
+{	/* RDMAP_SEND_SE_INVAL */
+	.hdr_len = sizeof(struct iwarp_send_inv),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
+		| cpu_to_be16(DDP_VERSION << 8)
+		| cpu_to_be16(RDMAP_VERSION << 6)
+		| cpu_to_be16(RDMAP_SEND_SE_INVAL),
+	.proc_data = siw_proc_send
+},
+{	/* RDMAP_TERMINATE */
+	.hdr_len = sizeof(struct iwarp_terminate),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
+	.ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST
+		| cpu_to_be16(DDP_VERSION << 8)
+		| cpu_to_be16(RDMAP_VERSION << 6)
+		| cpu_to_be16(RDMAP_TERMINATE),
+	.proc_data = siw_proc_terminate
+} };
+
+void siw_qp_llp_data_ready(struct sock *sk)
+{
+	struct siw_qp		*qp;
+
+	read_lock(&sk->sk_callback_lock);
+
+	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
+		goto done;
+
+	qp = sk_to_qp(sk);
+
+	if (likely(!qp->rx_ctx.rx_suspend &&
+		   down_read_trylock(&qp->state_lock))) {
+		read_descriptor_t rd_desc = {.arg.data = qp, .count = 1};
+
+		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
+			/*
+			 * Implements data receive operation during
+			 * socket callback. TCP gracefully catches
+			 * the case where there is nothing to receive
+			 * (not calling siw_tcp_rx_data() then).
+			 */
+			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
+
+		up_read(&qp->state_lock);
+	} else {
+		siw_dbg_qp(qp, "unable to rx, suspend: %d\n",
+			   qp->rx_ctx.rx_suspend);
+	}
+done:
+	read_unlock(&sk->sk_callback_lock);
+}
+
+void siw_qp_llp_close(struct siw_qp *qp)
+{
+	siw_dbg_qp(qp, "enter llp close, state = %s\n",
+		   siw_qp_state_to_string[qp->attrs.state]);
+
+	down_write(&qp->state_lock);
+
+	qp->rx_ctx.rx_suspend = 1;
+	qp->tx_ctx.tx_suspend = 1;
+	qp->attrs.sk = NULL;
+
+	switch (qp->attrs.state) {
+
+	case SIW_QP_STATE_RTS:
+	case SIW_QP_STATE_RTR:
+	case SIW_QP_STATE_IDLE:
+	case SIW_QP_STATE_TERMINATE:
+
+		qp->attrs.state = SIW_QP_STATE_ERROR;
+
+		break;
+	/*
+	 * SIW_QP_STATE_CLOSING:
+	 *
+	 * This is a forced close. Shall the QP be moved to
+	 * ERROR or IDLE?
+	 */
+	case SIW_QP_STATE_CLOSING:
+		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+		else
+			qp->attrs.state = SIW_QP_STATE_IDLE;
+
+		break;
+
+	default:
+		siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
+			   siw_qp_state_to_string[qp->attrs.state]);
+		break;
+	}
+	siw_sq_flush(qp);
+	siw_rq_flush(qp);
+
+	/*
+	 * Dereference closing CEP
+	 */
+	if (qp->cep) {
+		siw_cep_put(qp->cep);
+		qp->cep = NULL;
+	}
+
+	up_write(&qp->state_lock);
+
+	siw_dbg_qp(qp, "llp close exit: state %s\n",
+		   siw_qp_state_to_string[qp->attrs.state]);
+}
+
+/*
+ * socket callback routine informing about newly available send space.
+ * Function schedules SQ work for processing SQ items.
+ */
+void siw_qp_llp_write_space(struct sock *sk)
+{
+	struct siw_cep	*cep = sk_to_cep(sk);
+
+	cep->sk_write_space(sk);
+
+	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+		(void) siw_sq_start(cep->qp);
+}
+
+static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
+{
+	if (!irq_size)
+		irq_size = 1;
+	if (!orq_size)
+		orq_size = 1;
+
+	qp->attrs.irq_size = irq_size;
+	qp->attrs.orq_size = orq_size;
+
+	qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
+	if (!qp->irq) {
+		siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
+		qp->attrs.irq_size = 0;
+		return -ENOMEM;
+	}
+	qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
+	if (!qp->orq) {
+		siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
+		qp->attrs.orq_size = 0;
+		qp->attrs.irq_size = 0;
+		vfree(qp->irq);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static int siw_qp_enable_crc(struct siw_qp *qp)
+{
+	struct siw_iwarp_rx *c_rx = &qp->rx_ctx;
+	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+	int rv = 0;
+
+	if (siw_crypto_shash == NULL) {
+		rv = -ENOENT;
+		goto error;
+	}
+	c_tx->mpa_crc_hd = kzalloc(sizeof(struct shash_desc) +
+				   crypto_shash_descsize(siw_crypto_shash),
+				   GFP_KERNEL);
+	c_rx->mpa_crc_hd = kzalloc(sizeof(struct shash_desc) +
+				   crypto_shash_descsize(siw_crypto_shash),
+				   GFP_KERNEL);
+	if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
+		rv = -ENOMEM;
+		goto error;
+	}
+	c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
+	c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
+
+	return 0;
+error:
+	siw_dbg_qp(qp, "failed loading crc32c. error %d\n", rv);
+
+	kfree(c_tx->mpa_crc_hd);
+	kfree(c_rx->mpa_crc_hd);
+
+	c_tx->mpa_crc_hd = c_rx->mpa_crc_hd = NULL;
+
+	return rv;
+}
+
+/*
+ * Send a non signalled READ or WRITE to peer side as negotiated
+ * with MPAv2 P2P setup protocol. The work request is only created
+ * as a current active WR and does not consume Send Queue space.
+ *
+ * Caller must hold QP state lock.
+ */
+int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
+{
+	struct siw_wqe	*wqe = tx_wqe(qp);
+	unsigned long flags;
+	int rv = 0;
+
+	spin_lock_irqsave(&qp->sq_lock, flags);
+
+	if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
+		spin_unlock_irqrestore(&qp->sq_lock, flags);
+		return -EIO;
+	}
+	memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+
+	wqe->wr_status = SIW_WR_QUEUED;
+	wqe->sqe.flags = 0;
+	wqe->sqe.num_sge = 1;
+	wqe->sqe.sge[0].length = 0;
+	wqe->sqe.sge[0].laddr = 0;
+	wqe->sqe.sge[0].lkey = 0;
+	/*
+	 * While the STag must not be checked for inbound zero length
+	 * READ/WRITE, some HW may treat STag 0 specially.
+	 */
+	wqe->sqe.rkey = 1;
+	wqe->sqe.raddr = 0;
+	wqe->processed = 0;
+
+	if (ctrl & MPA_V2_RDMA_WRITE_RTR)
+		wqe->sqe.opcode = SIW_OP_WRITE;
+	else if (ctrl & MPA_V2_RDMA_READ_RTR) {
+		struct siw_sqe	*rreq;
+
+		wqe->sqe.opcode = SIW_OP_READ;
+
+		spin_lock(&qp->orq_lock);
+
+		rreq = orq_get_free(qp);
+		if (rreq) {
+			siw_read_to_orq(rreq, &wqe->sqe);
+			qp->orq_put++;
+		} else
+			rv = -EIO;
+
+		spin_unlock(&qp->orq_lock);
+	} else
+		rv = -EINVAL;
+
+	if (rv)
+		wqe->wr_status = SIW_WR_IDLE;
+
+	spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+	if (!rv)
+		rv = siw_sq_start(qp);
+
+	return rv;
+}
+
+/*
+ * Map memory access error to DDP tagged error
+ */
+enum ddp_ecode siw_tagged_error(enum siw_access_state state)
+{
+	if (state == E_STAG_INVALID)
+		return DDP_ECODE_T_INVALID_STAG;
+	if (state == E_BASE_BOUNDS)
+		return DDP_ECODE_T_BASE_BOUNDS;
+	if (state == E_PD_MISMATCH)
+		return DDP_ECODE_T_STAG_NOT_ASSOC;
+	if (state == E_ACCESS_PERM)
+		/*
+		 * RFC 5041 (DDP) lacks an ecode for insufficient access
+		 * permissions. 'Invalid STag' seems to be the closest
+		 * match though.
+		 */
+		return DDP_ECODE_T_INVALID_STAG;
+
+	WARN_ON(1);
+
+	return DDP_ECODE_T_INVALID_STAG;
+}
+
+/*
+ * Map memory access error to RDMAP protection error
+ */
+enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
+{
+	if (state == E_STAG_INVALID)
+		return RDMAP_ECODE_INVALID_STAG;
+	if (state == E_BASE_BOUNDS)
+		return RDMAP_ECODE_BASE_BOUNDS;
+	if (state == E_PD_MISMATCH)
+		return RDMAP_ECODE_STAG_NOT_ASSOC;
+	if (state == E_ACCESS_PERM)
+		return RDMAP_ECODE_ACCESS_RIGHTS;
+
+	return RDMAP_ECODE_UNSPECIFIED;
+}
+
+void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer,
+			u8 etype, u8 ecode, int in_tx)
+{
+	if (!qp->term_info.valid) {
+		memset(&qp->term_info, 0, sizeof(qp->term_info));
+		qp->term_info.layer = layer;
+		qp->term_info.etype = etype;
+		qp->term_info.ecode = ecode;
+		qp->term_info.in_tx = in_tx;
+		qp->term_info.valid = 1;
+	}
+	siw_dbg_qp(qp,
+		   "init TERM: layer %d, type %d, code %d, in tx %s\n",
+		   layer, etype, ecode, in_tx ? "yes" : "no");
+}
+
+/*
+ * Send a TERMINATE message, as defined in RFCs 5040/5041/5044/6581.
+ * Sending TERMINATE messages is best effort - such messages
+ * can only be sent if the QP is still connected and it does
+ * not have another outbound message in-progress, i.e. the
+ * TERMINATE message must not interfere with an incomplete current
+ * transmit operation.
+ */
+void siw_send_terminate(struct siw_qp *qp)
+{
+	struct kvec		iov[3];
+	struct msghdr		msg = {.msg_flags = MSG_DONTWAIT|MSG_EOR};
+	struct iwarp_terminate	*term = NULL;
+	union iwarp_hdr		*err_hdr = NULL;
+	struct socket		*s = qp->attrs.sk;
+	struct siw_iwarp_rx	*rx_ctx = &qp->rx_ctx;
+	union iwarp_hdr		*rx_hdr = &rx_ctx->hdr;
+	u32 crc = 0;
+	int num_frags, len_terminate, rv;
+
+	if (!qp->term_info.valid)
+		return;
+
+	qp->term_info.valid = 0;
+
+	if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
+		siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
+			   tx_type(tx_wqe(qp)));
+		return;
+	}
+	if (!s && qp->cep)
+		/* QP not yet in RTS. Take socket from connection end point */
+		s = qp->cep->llp.sock;
+
+	if (!s) {
+		siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
+		return;
+	}
+
+	term = kzalloc(sizeof(*term), GFP_KERNEL);
+	if (!term)
+		return;
+
+	term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
+	term->ddp_mo = 0;
+	term->ddp_msn = cpu_to_be32(1);
+
+	iov[0].iov_base = term;
+	iov[0].iov_len = sizeof(*term);
+
+	if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
+	    ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
+	     (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
+		err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
+		if (!err_hdr) {
+			kfree(term);
+			return;
+		}
+	}
+	memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
+	       sizeof(struct iwarp_ctrl));
+
+	__rdmap_term_set_layer(term, qp->term_info.layer);
+	__rdmap_term_set_etype(term, qp->term_info.etype);
+	__rdmap_term_set_ecode(term, qp->term_info.ecode);
+
+	switch (qp->term_info.layer) {
+
+	case TERM_ERROR_LAYER_RDMAP:
+		if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
+			/* No additional DDP/RDMAP header to be included */
+			break;
+
+		if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
+			/*
+			 * Complete RDMAP frame will get attached, and
+			 * DDP segment length is valid
+			 */
+			term->flag_m = 1;
+			term->flag_d = 1;
+			term->flag_r = 1;
+
+			if (qp->term_info.in_tx) {
+				struct iwarp_rdma_rreq *rreq;
+				struct siw_wqe *wqe = tx_wqe(qp);
+
+				/* Inbound RREQ error, detected during
+				 * RRESP creation. Take state from
+				 * current TX work queue element to
+				 * reconstruct peer's RREQ.
+				 */
+				rreq = (struct iwarp_rdma_rreq *)err_hdr;
+
+				memcpy(&rreq->ctrl,
+				       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
+				       sizeof(struct iwarp_ctrl));
+
+				rreq->rsvd = 0;
+				rreq->ddp_qn =
+					htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
+
+				/* Provide RREQ's MSN as kept aside */
+				rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
+
+				rreq->ddp_mo = htonl(wqe->processed);
+				rreq->sink_stag = htonl(wqe->sqe.rkey);
+				rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
+				rreq->read_size = htonl(wqe->sqe.sge[0].length);
+				rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
+				rreq->source_to =
+					cpu_to_be64(wqe->sqe.sge[0].laddr);
+
+				iov[1].iov_base = rreq;
+				iov[1].iov_len = sizeof(*rreq);
+
+				rx_hdr = (union iwarp_hdr *)rreq;
+			} else {
+				/* Take RDMAP/DDP information from
+				 * current (failed) inbound frame.
+				 */
+				iov[1].iov_base = rx_hdr;
+
+				if (__rdmap_opcode(&rx_hdr->ctrl) ==
+				    RDMAP_RDMA_READ_REQ)
+					iov[1].iov_len =
+						sizeof(struct iwarp_rdma_rreq);
+				else /* SEND type */
+					iov[1].iov_len =
+						sizeof(struct iwarp_send);
+			}
+		} else {
+			/* Do not report DDP hdr information if packet
+			 * layout is unknown
+			 */
+			if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
+			    (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
+				break;
+
+			iov[1].iov_base = rx_hdr;
+
+			/* Only DDP frame will get attached */
+			if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
+				iov[1].iov_len =
+					sizeof(struct iwarp_rdma_write);
+			else
+				iov[1].iov_len = sizeof(struct iwarp_send);
+
+			term->flag_m = 1;
+			term->flag_d = 1;
+		}
+		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
+
+		break;
+
+	case TERM_ERROR_LAYER_DDP:
+		/* Report error encountered while DDP processing.
+		 * This can only happen as a result of inbound
+		 * DDP processing
+		 */
+
+		/* Do not report DDP hdr information if packet
+		 * layout is unknown
+		 */
+		if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
+		     (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
+		    ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
+		     (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
+			break;
+
+		iov[1].iov_base = rx_hdr;
+
+		if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
+			iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
+		else
+			iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
+
+		term->flag_m = 1;
+		term->flag_d = 1;
+
+		break;
+
+	default:
+		break;
+
+	}
+	if (term->flag_m || term->flag_d || term->flag_r) {
+		iov[2].iov_base = &crc;
+		iov[2].iov_len = sizeof(crc);
+		len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
+		num_frags = 3;
+	} else {
+		iov[1].iov_base = &crc;
+		iov[1].iov_len = sizeof(crc);
+		len_terminate = sizeof(*term) + MPA_CRC_SIZE;
+		num_frags = 2;
+	}
+
+	/* Adjust DDP Segment Length parameter, if valid */
+	if (term->flag_m) {
+		u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
+		enum rdma_opcode op = __rdmap_opcode(&rx_hdr->ctrl);
+
+		real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
+		rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
+	}
+
+	term->ctrl.mpa_len = cpu_to_be16(len_terminate -
+					 (MPA_HDR_SIZE + MPA_CRC_SIZE));
+	if (qp->tx_ctx.mpa_crc_hd) {
+		crypto_shash_init(rx_ctx->mpa_crc_hd);
+		if (siw_crc_array(rx_ctx->mpa_crc_hd, (u8 *)iov[0].iov_base,
+				  iov[0].iov_len))
+			goto out;
+
+		if (num_frags == 3) {
+			if (siw_crc_array(rx_ctx->mpa_crc_hd,
+					  (u8 *)iov[1].iov_base,
+					  iov[1].iov_len))
+				goto out;
+		}
+		crypto_shash_final(rx_ctx->mpa_crc_hd, (u8 *)&crc);
+	}
+
+	rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
+	siw_dbg_qp(qp,
+		   "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
+		   rv == len_terminate ? "success" : "failure",
+		   __rdmap_term_layer(term), __rdmap_term_etype(term),
+		   __rdmap_term_ecode(term), rv);
+out:
+	kfree(term);
+	kfree(err_hdr);
+}
+
+/*
+ * handle all attrs other than state
+ */
+static void siw_qp_modify_nonstate(struct siw_qp *qp,
+				   struct siw_qp_attrs *attrs,
+				   enum siw_qp_attr_mask mask)
+{
+	if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
+		if (attrs->flags & SIW_RDMA_BIND_ENABLED)
+			qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
+		else
+			qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
+
+		if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
+			qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+		else
+			qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
+
+		if (attrs->flags & SIW_RDMA_READ_ENABLED)
+			qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
+		else
+			qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
+	}
+}
+
+/*
+ * caller holds qp->state_lock
+ */
+int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
+		  enum siw_qp_attr_mask mask)
+{
+	int	drop_conn = 0, rv = 0;
+
+	if (!mask)
+		return 0;
+
+	siw_dbg_qp(qp, "state: %s => %s\n",
+		   siw_qp_state_to_string[qp->attrs.state],
+		   siw_qp_state_to_string[attrs->state]);
+
+	if (mask != SIW_QP_ATTR_STATE)
+		siw_qp_modify_nonstate(qp, attrs, mask);
+
+	if (!(mask & SIW_QP_ATTR_STATE))
+		return 0;
+
+	switch (qp->attrs.state) {
+
+	case SIW_QP_STATE_IDLE:
+	case SIW_QP_STATE_RTR:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_RTS:
+
+			if (attrs->flags & SIW_MPA_CRC) {
+				rv = siw_qp_enable_crc(qp);
+				if (rv)
+					break;
+			}
+			if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
+				siw_dbg_qp(qp, "no socket\n");
+				rv = -EINVAL;
+				break;
+			}
+			if (!(mask & SIW_QP_ATTR_MPA)) {
+				siw_dbg_qp(qp, "no MPA\n");
+				rv = -EINVAL;
+				break;
+			}
+			siw_dbg_qp(qp, "enter rts, peer 0x%08x, loc 0x%08x\n",
+				   qp->cep->llp.raddr.sin_addr.s_addr,
+				   qp->cep->llp.laddr.sin_addr.s_addr);
+			/*
+			 * Initialize iWARP TX state
+			 */
+			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
+			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
+			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
+
+			/*
+			 * Initialize iWARP RX state
+			 */
+			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
+			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
+			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
+
+			/*
+			 * init IRD free queue, caller has already checked
+			 * limits.
+			 */
+			rv = siw_qp_readq_init(qp, attrs->irq_size,
+					       attrs->orq_size);
+			if (rv)
+				break;
+
+			qp->attrs.sk = attrs->sk;
+			qp->attrs.state = SIW_QP_STATE_RTS;
+
+			break;
+
+		case SIW_QP_STATE_ERROR:
+			siw_rq_flush(qp);
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+			if (qp->cep) {
+				siw_cep_put(qp->cep);
+				qp->cep = NULL;
+			}
+			break;
+
+		case SIW_QP_STATE_RTR:
+			/* ignore */
+			break;
+
+		default:
+			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
+				   siw_qp_state_to_string[qp->attrs.state],
+				   siw_qp_state_to_string[attrs->state]);
+			break;
+		}
+		break;
+
+	case SIW_QP_STATE_RTS:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_CLOSING:
+			/*
+			 * Verbs: move to IDLE if SQ and ORQ are empty.
+			 * Move to ERROR otherwise. But first of all we must
+			 * close the connection. So we keep CLOSING or ERROR
+			 * as a transient state, schedule connection drop work
+			 * and wait for the socket state change upcall to
+			 * come back closed.
+			 */
+			if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
+				qp->attrs.state = SIW_QP_STATE_CLOSING;
+			} else {
+				qp->attrs.state = SIW_QP_STATE_ERROR;
+				siw_sq_flush(qp);
+			}
+			siw_rq_flush(qp);
+
+			drop_conn = 1;
+			break;
+
+		case SIW_QP_STATE_TERMINATE:
+			qp->attrs.state = SIW_QP_STATE_TERMINATE;
+
+			siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+					   RDMAP_ETYPE_CATASTROPHIC,
+					   RDMAP_ECODE_UNSPECIFIED, 1);
+			drop_conn = 1;
+
+			break;
+
+		case SIW_QP_STATE_ERROR:
+			/*
+			 * This is an emergency close.
+			 *
+			 * Any in progress transmit operation will get
+			 * cancelled.
+			 * This will likely result in a protocol failure,
+			 * if a TX operation is in transit. The caller
+			 * could unconditionally wait to give the current
+			 * operation a chance to complete.
+			 * Esp., how to handle the non-empty IRQ case?
+			 * The peer was asking for data transfer at a valid
+			 * point in time.
+			 */
+			siw_sq_flush(qp);
+			siw_rq_flush(qp);
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+			drop_conn = 1;
+
+			break;
+
+		default:
+			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
+				   siw_qp_state_to_string[qp->attrs.state],
+				   siw_qp_state_to_string[attrs->state]);
+			break;
+		}
+		break;
+
+	case SIW_QP_STATE_TERMINATE:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_ERROR:
+			siw_rq_flush(qp);
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+
+			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
+				siw_sq_flush(qp);
+
+			break;
+
+		default:
+			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
+				   siw_qp_state_to_string[qp->attrs.state],
+				   siw_qp_state_to_string[attrs->state]);
+		}
+		break;
+
+	case SIW_QP_STATE_CLOSING:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_IDLE:
+			WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
+			qp->attrs.state = SIW_QP_STATE_IDLE;
+
+			break;
+
+		case SIW_QP_STATE_CLOSING:
+			/*
+			 * The LLP may have already moved the QP to closing
+			 * due to graceful peer close init
+			 */
+			break;
+
+		case SIW_QP_STATE_ERROR:
+			/*
+			 * QP was moved to CLOSING by LLP event
+			 * not yet seen by user.
+			 */
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+
+			if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
+				siw_sq_flush(qp);
+
+			siw_rq_flush(qp);
+
+			break;
+
+		default:
+			siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
+				   siw_qp_state_to_string[qp->attrs.state],
+				   siw_qp_state_to_string[attrs->state]);
+
+			return -ECONNABORTED;
+		}
+		break;
+
+	default:
+		siw_dbg_qp(qp, " noop: state %s\n",
+			   siw_qp_state_to_string[qp->attrs.state]);
+		break;
+	}
+	if (drop_conn)
+		siw_qp_cm_drop(qp, 0);
+
+	return rv;
+}
+
+struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
+{
+	struct siw_qp *qp =  siw_qp_id2obj(to_siw_dev(base_dev), id);
+
+	if (qp) {
+		/*
+		 * siw_qp_id2obj() increments object reference count
+		 */
+		siw_qp_put(qp);
+		siw_dbg_qp(qp, "got base QP");
+
+		return &qp->base_qp;
+	}
+	return (struct ib_qp *)NULL;
+}
+
+/*
+ * siw_check_mem()
+ *
+ * Check protection domain, STAG state, access permissions and
+ * address range for memory object.
+ *
+ * @pd:		Protection Domain memory should belong to
+ * @mem:	memory to be checked
+ * @addr:	starting addr of mem
+ * @perms:	requested access permissions
+ * @len:	len of memory interval to be checked
+ *
+ */
+int siw_check_mem(struct siw_pd *pd, struct siw_mem *mem, u64 addr,
+		  enum siw_access_flags perms, int len)
+{
+	if (siw_mem2mr(mem)->pd != pd) {
+		siw_dbg(pd->hdr.sdev, "[PD %d]: pd mismatch\n", OBJ_ID(pd));
+		return -E_PD_MISMATCH;
+	}
+	if (!mem->stag_valid) {
+		siw_dbg(pd->hdr.sdev, "[PD %d]: stag 0x%08x invalid\n",
+			OBJ_ID(pd), OBJ_ID(mem));
+		return -E_STAG_INVALID;
+	}
+	/*
+	 * check access permissions
+	 */
+	if ((mem->perms & perms) < perms) {
+		siw_dbg(pd->hdr.sdev, "[PD %d]: permissions 0x%08x < 0x%08x\n",
+			OBJ_ID(pd), mem->perms, perms);
+		return -E_ACCESS_PERM;
+	}
+	/*
+	 * Check address interval: we relax the check to allow memory shrunk
+	 * from the start address _after_ placing or fetching len bytes.
+	 * TODO: this relaxation is probably overdone
+	 */
+	if (addr < mem->va || addr + len > mem->va + mem->len) {
+		siw_dbg(pd->hdr.sdev, "[PD %d]: MEM interval len %d\n",
+			OBJ_ID(pd), len);
+		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx) out of bounds\n",
+			(unsigned long long)addr,
+			(unsigned long long)(addr + len));
+		siw_dbg(pd->hdr.sdev, "[0x%016llx, 0x%016llx] LKey=0x%08x\n",
+			(unsigned long long)mem->va,
+			(unsigned long long)(mem->va + mem->len),
+			OBJ_ID(mem));
+
+		return -E_BASE_BOUNDS;
+	}
+	return E_ACCESS_OK;
+}
+
+/*
+ * siw_check_sge()
+ *
+ * Check SGE for access rights in given interval
+ *
+ * @pd:		Protection Domain memory should belong to
+ * @sge:	SGE to be checked
+ * @mem:	array of memory references
+ * @perms:	requested access permissions
+ * @off:	starting offset in SGE
+ * @len:	len of memory interval to be checked
+ *
+ * NOTE: Function references SGE's memory object (mem->obj)
+ * if not yet done. New reference is kept if check went ok and
+ * released if check failed. If mem->obj is already valid, no new
+ * lookup is being done and mem is not released if check fails.
+ */
+int
+siw_check_sge(struct siw_pd *pd, struct siw_sge *sge,
+	      struct siw_mem *mem[], enum siw_access_flags perms,
+	      u32 off, int len)
+{
+	struct siw_device *sdev = pd->hdr.sdev;
+	int new_ref = 0, rv = E_ACCESS_OK;
+
+	if (len + off > sge->length) {
+		rv = -E_BASE_BOUNDS;
+		goto fail;
+	}
+	if (*mem == NULL) {
+		*mem = siw_mem_id2obj(sdev, sge->lkey >> 8);
+		if (*mem == NULL) {
+			rv = -E_STAG_INVALID;
+			goto fail;
+		}
+		new_ref = 1;
+	}
+
+	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
+	if (rv)
+		goto fail;
+
+	return 0;
+
+fail:
+	if (new_ref) {
+		siw_mem_put(*mem);
+		*mem = NULL;
+	}
+	return rv;
+}
+
+void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
+{
+	rreq->id = sqe->id;
+	rreq->opcode = sqe->opcode;
+	rreq->sge[0].laddr = sqe->sge[0].laddr;
+	rreq->sge[0].length = sqe->sge[0].length;
+	rreq->sge[0].lkey = sqe->sge[0].lkey;
+	rreq->sge[1].lkey = sqe->sge[1].lkey;
+	rreq->flags = sqe->flags | SIW_WQE_VALID;
+	rreq->num_sge = 1;
+}
+
+/*
+ * Must be called with SQ locked.
+ * To avoid complete SQ starvation by constant inbound READ requests,
+ * the active IRQ will not be served after qp->irq_burst, if the
+ * SQ has pending work.
+ */
+int siw_activate_tx(struct siw_qp *qp)
+{
+	struct siw_sqe	*irqe, *sqe;
+	struct siw_wqe	*wqe = tx_wqe(qp);
+	int rv = 1;
+
+	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
+
+	if (irqe->flags & SIW_WQE_VALID) {
+		sqe = sq_get_next(qp);
+
+		/*
+		 * Avoid local WQE processing starvation in case
+		 * of constant inbound READ request stream
+		 */
+		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
+			qp->irq_burst = 0;
+			goto skip_irq;
+		}
+		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+		wqe->wr_status = SIW_WR_QUEUED;
+
+		/* start READ RESPONSE */
+		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
+		wqe->sqe.flags = 0;
+		if (irqe->num_sge) {
+			wqe->sqe.num_sge = 1;
+			wqe->sqe.sge[0].length = irqe->sge[0].length;
+			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
+			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
+		} else {
+			wqe->sqe.num_sge = 0;
+		}
+
+		/* Retain original RREQ's message sequence number for
+		 * potential error reporting cases.
+		 */
+		wqe->sqe.sge[1].length = irqe->sge[1].length;
+
+		wqe->sqe.rkey = irqe->rkey;
+		wqe->sqe.raddr = irqe->raddr;
+
+		wqe->processed = 0;
+		qp->irq_get++;
+
+		/* mark current IRQ entry free */
+		smp_store_mb(irqe->flags, 0);
+
+		goto out;
+	}
+
+	sqe = sq_get_next(qp);
+	if (sqe) {
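+		/*
+		 * skip_irq is entered from the IRQ branch above with a
+		 * valid sqe, if the IRQ exceeded its burst quota.
+		 */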
+skip_irq:
+		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+		wqe->wr_status = SIW_WR_QUEUED;
+
+		/* First copy SQE to kernel private memory */
+		memcpy(&wqe->sqe, sqe, sizeof(*sqe));
+
+		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
+			rv = -EINVAL;
+			goto out;
+		}
+		if (wqe->sqe.flags & SIW_WQE_INLINE) {
+			if (wqe->sqe.opcode != SIW_OP_SEND &&
+			    wqe->sqe.opcode != SIW_OP_WRITE) {
+				rv = -EINVAL;
+				goto out;
+			}
+			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
+				rv = -EINVAL;
+				goto out;
+			}
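+			/*
+			 * Inline data reside right behind the first SGE of
+			 * the (already copied) SQE. Reference them as one
+			 * kernel-local SGE with a zero lkey.
+			 */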
+			wqe->sqe.sge[0].laddr = (u64)(uintptr_t)&wqe->sqe.sge[1];
+			wqe->sqe.sge[0].lkey = 0;
+			wqe->sqe.num_sge = 1;
+		}
+		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
+			/* A READ cannot be fenced */
+			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
+			    wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV)) {
+				siw_dbg_qp(qp, "cannot fence read\n");
+				rv = -EINVAL;
+				goto out;
+			}
+			spin_lock(&qp->orq_lock);
+
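+			/*
+			 * If the ORQ is not empty, defer processing: the WQE
+			 * stays QUEUED and transmission resumes only after
+			 * all outstanding READ responses have come in.
+			 */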
+			if (!siw_orq_empty(qp)) {
+				qp->tx_ctx.orq_fence = 1;
+				rv = 0;
+			}
+			spin_unlock(&qp->orq_lock);
+
+		} else if (wqe->sqe.opcode == SIW_OP_READ ||
+			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+			struct siw_sqe	*rreq;
+
+			wqe->sqe.num_sge = 1;
+
+			spin_lock(&qp->orq_lock);
+
+			rreq = orq_get_free(qp);
+			if (rreq) {
+				/*
+				 * Make an immediate copy in ORQ to be ready
+				 * to process loopback READ reply
+				 */
+				siw_read_to_orq(rreq, &wqe->sqe);
+				qp->orq_put++;
+			} else {
+				qp->tx_ctx.orq_fence = 1;
+				rv = 0;
+			}
+			spin_unlock(&qp->orq_lock);
+		}
+
+		/* Clear SQE, can be re-used by application */
+		smp_store_mb(sqe->flags, 0);
+		qp->sq_get++;
+	} else {
+		rv = 0;
+	}
+out:
+	if (unlikely(rv < 0)) {
+		siw_dbg_qp(qp, "error %d\n", rv);
+		wqe->wr_status = SIW_WR_IDLE;
+	}
+	return rv;
+}
+
+/*
+ * Check if current CQ state qualifies for
+ * calling CQ completion handler. Must be
+ * called with CQ lock held.
+ */
+static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
+{
+	u64 cq_notify;
+
+	if (!cq->base_cq.comp_handler)
+		return false;
+
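+	/*
+	 * The CQ arming state is shared with the CQ user and may change
+	 * concurrently, hence READ_ONCE/smp_store_mb access.
+	 */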
+	cq_notify = READ_ONCE(*cq->notify);
+
+	if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
+	    ((cq_notify & SIW_NOTIFY_SOLICITED) &&
+	     (flags & SIW_WQE_SOLICITED))) {
+		/* dis-arm CQ */
+		smp_store_mb(*cq->notify, SIW_NOTIFY_NOT);
+
+		return true;
+	}
+	return false;
+}
+
+/* Must be called without holding CQ lock */
+static inline void siw_cq_completion(struct siw_cq *cq)
+{
+	siw_dbg_obj(cq, "Completion\n");
+	(*cq->base_cq.comp_handler)(&cq->base_cq, cq->base_cq.cq_context);
+}
+
+int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
+		     enum siw_wc_status status)
+{
+	struct siw_cq *cq = qp->scq;
+	int rv = 0;
+
+	if (cq) {
+		u32 sqe_flags = sqe->flags;
+		struct siw_cqe *cqe;
+		u32 idx;
+		unsigned long flags;
+
+		spin_lock_irqsave(&cq->lock, flags);
+
+		idx = cq->cq_put % cq->num_cqe;
+		cqe = &cq->queue[idx];
+
+		if (!READ_ONCE(cqe->flags)) {
+			bool notify;
+
+			cqe->id = sqe->id;
+			cqe->opcode = sqe->opcode;
+			cqe->status = status;
+			cqe->imm_data = 0;
+			cqe->bytes = bytes;
+
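+			/*
+			 * Kernel clients get a referenced QP pointer in the
+			 * CQE, to be dropped when reaping the CQE. User
+			 * space only needs the QP number.
+			 */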
+			if (cq->kernel_verbs) {
+				siw_qp_get(qp);
+				cqe->qp = qp;
+			} else {
+				cqe->qp_id = QP_ID(qp);
+			}
+			/* mark CQE valid for application */
+			WRITE_ONCE(cqe->flags, SIW_WQE_VALID);
+			/* recycle SQE */
+			smp_store_mb(sqe->flags, 0);
+
+			cq->cq_put++;
+			notify = siw_cq_notify_now(cq, sqe_flags);
+
+			spin_unlock_irqrestore(&cq->lock, flags);
+
+			if (notify)
+				siw_cq_completion(cq);
+		} else {
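+			/*
+			 * CQ overflow: the CQE slot is still owned by the
+			 * application. Raise a CQ error event.
+			 */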
+			spin_unlock_irqrestore(&cq->lock, flags);
+			rv = -ENOMEM;
+			siw_cq_event(cq, IB_EVENT_CQ_ERR);
+		}
+	} else {
+		/* recycle SQE */
+		smp_store_mb(sqe->flags, 0);
+	}
+	return rv;
+}
+
+int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
+		     enum siw_wc_status status)
+{
+	struct siw_cq *cq = qp->rcq;
+	int rv = 0;
+
+	if (cq) {
+		struct siw_cqe *cqe;
+		u32 idx;
+		unsigned long flags;
+
+		spin_lock_irqsave(&cq->lock, flags);
+
+		idx = cq->cq_put % cq->num_cqe;
+		cqe = &cq->queue[idx];
+
+		if (!READ_ONCE(cqe->flags)) {
+			bool notify;
+
+			cqe->id = rqe->id;
+			cqe->opcode = SIW_OP_RECEIVE;
+			cqe->status = status;
+			cqe->imm_data = 0;
+			cqe->bytes = bytes;
+
+			if (cq->kernel_verbs) {
+				siw_qp_get(qp);
+				cqe->qp = qp;
+			} else {
+				cqe->qp_id = QP_ID(qp);
+			}
+			/* mark CQE valid for application */
+			WRITE_ONCE(cqe->flags, SIW_WQE_VALID);
+			/* recycle RQE */
+			smp_store_mb(rqe->flags, 0);
+
+			cq->cq_put++;
+			notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED);
+
+			spin_unlock_irqrestore(&cq->lock, flags);
+
+			if (notify)
+				siw_cq_completion(cq);
+		} else {
+			spin_unlock_irqrestore(&cq->lock, flags);
+			rv = -ENOMEM;
+			siw_cq_event(cq, IB_EVENT_CQ_ERR);
+		}
+	} else {
+		/* recycle RQE */
+		smp_store_mb(rqe->flags, 0);
+	}
+	return rv;
+}
+
+/*
+ * siw_sq_flush()
+ *
+ * Flush SQ and ORQ entries to CQ.
+ *
+ * TODO: Add termination code for in-progress WQE.
+ * TODO: An in-progress WQE may have been partially
+ *       processed. It should be enforced that transmission
+ *       of a started DDP segment is completed whenever
+ *       possible.
+ *
+ * Must be called with QP state write lock held.
+ * Therefore, SQ and ORQ lock must not be taken.
+ */
+void siw_sq_flush(struct siw_qp *qp)
+{
+	struct siw_sqe	*sqe;
+	struct siw_wqe	*wqe = tx_wqe(qp);
+	int		async_event = 0;
+
+	siw_dbg_qp(qp, "enter\n");
+
+	/*
+	 * Start with completing any work currently on the ORQ,
+	 * i.e. outbound READ requests still waiting for their responses.
+	 */
+	for (;;) {
+		if (qp->attrs.orq_size == 0)
+			break;
+
+		sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size];
+		if (!READ_ONCE(sqe->flags))
+			break;
+
+		if (siw_sqe_complete(qp, sqe, 0,
+				     SIW_WC_WR_FLUSH_ERR) != 0)
+			break;
+
+		qp->orq_get++;
+	}
+	/*
+	 * Flush an in-progress WQE if present
+	 */
+	if (wqe->wr_status != SIW_WR_IDLE) {
+		/*
+		 * TODO: Add iWARP Termination code
+		 */
+		siw_dbg_qp(qp, "flush current sqe, type %d, status %d\n",
+			   tx_type(wqe), wqe->wr_status);
+
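+		/* Drop memory object references still held by the WQE */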
+		siw_wqe_put_mem(wqe, tx_type(wqe));
+
+		if (tx_type(wqe) != SIW_OP_READ_RESPONSE &&
+			((tx_type(wqe) != SIW_OP_READ &&
+			  tx_type(wqe) != SIW_OP_READ_LOCAL_INV) ||
+			wqe->wr_status == SIW_WR_QUEUED))
+			/*
+			 * An in-progress Read Request is already in
+			 * the ORQ
+			 */
+			siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+					 SIW_WC_WR_FLUSH_ERR);
+
+		wqe->wr_status = SIW_WR_IDLE;
+	}
+	/*
+	 * Flush the Send Queue
+	 */
+	while (qp->attrs.sq_size) {
+		sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+		if (!READ_ONCE(sqe->flags))
+			break;
+
+		async_event = 1;
+		if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+			/*
+			 * Shall IB_EVENT_SQ_DRAINED be suppressed if work
+			 * completion fails?
+			 */
+			break;
+
+		WRITE_ONCE(sqe->flags, 0);
+		qp->sq_get++;
+	}
+	if (async_event)
+		siw_qp_event(qp, IB_EVENT_SQ_DRAINED);
+}
+
+/*
+ * siw_rq_flush()
+ *
+ * Flush recv queue entries to CQ.
+ *
+ * Must be called with QP state write lock held.
+ * Therefore, RQ lock must not be taken.
+ */
+void siw_rq_flush(struct siw_qp *qp)
+{
+	struct siw_wqe		*wqe = rx_wqe(qp);
+
+	siw_dbg_qp(qp, "enter\n");
+
+	/*
+	 * Flush an in-progress WQE if present
+	 */
+	if (wqe->wr_status != SIW_WR_IDLE) {
+		siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n",
+			   rx_type(wqe), wqe->wr_status);
+
+		siw_wqe_put_mem(wqe, rx_type(wqe));
+
+		if (rx_type(wqe) == SIW_OP_RECEIVE) {
+			siw_rqe_complete(qp, &wqe->rqe, wqe->bytes,
+					 SIW_WC_WR_FLUSH_ERR);
+		} else if (rx_type(wqe) != SIW_OP_READ &&
+			   rx_type(wqe) != SIW_OP_READ_RESPONSE &&
+			   rx_type(wqe) != SIW_OP_WRITE) {
+			siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR);
+		}
+		wqe->wr_status = SIW_WR_IDLE;
+	}
+	/*
+	 * Flush the Receive Queue
+	 */
+	while (qp->attrs.rq_size) {
+		struct siw_rqe *rqe =
+			&qp->recvq[qp->rq_get % qp->attrs.rq_size];
+
+		if (!READ_ONCE(rqe->flags))
+			break;
+
+		if (siw_rqe_complete(qp, rqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+			break;
+
+		WRITE_ONCE(rqe->flags, 0);
+		qp->rq_get++;
+	}
+}