
[for-next,v11,08/11] RDMA/erdma: Add connection management (CM) support

Message ID 20220615015227.65686-9-chengyou@linux.alibaba.com (mailing list archive)
State Superseded
Delegated to: Jason Gunthorpe
Series Elastic RDMA Adapter (ERDMA) driver

Commit Message

Cheng Xu June 15, 2022, 1:52 a.m. UTC
ERDMA's transport protocol is iWarp, so the driver must support the CM
interface. For the CM part, we take the same approach as SoftiWarp: a
kernel socket is used to set up the connection, and MPA negotiation is
then performed in the kernel. This part of the code therefore mainly
comes from SoftiWarp; based on it, we add some more features, such as a
non-blocking iw_connect implementation.
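
For illustration only, here is a rough sketch of the non-blocking connect
flow described above, using plain kernel socket APIs. This is not the
driver code; the demo_* names are hypothetical placeholders for the erdma
handlers added by this patch.

#include <linux/fcntl.h>
#include <linux/net.h>
#include <linux/workqueue.h>
#include <net/sock.h>
#include <net/tcp_states.h>

/* Hypothetical deferred-work context; the real driver uses erdma_cm_wq. */
static struct workqueue_struct *demo_wq;
static struct work_struct demo_connected_work;

static void demo_sk_state_change(struct sock *sk)
{
	/* Softirq context: only queue work here, send the MPA request later. */
	if (sk->sk_state == TCP_ESTABLISHED)
		queue_work(demo_wq, &demo_connected_work);
}

static int demo_nonblocking_connect(struct socket *s,
				    struct sockaddr *raddr, int raddrlen)
{
	int ret;

	/* Hook the socket state-change upcall before connecting. */
	write_lock_bh(&s->sk->sk_callback_lock);
	s->sk->sk_state_change = demo_sk_state_change;
	write_unlock_bh(&s->sk->sk_callback_lock);

	/* With O_NONBLOCK, connect() returns -EINPROGRESS instead of sleeping. */
	ret = s->ops->connect(s, raddr, raddrlen, O_NONBLOCK);
	if (ret && ret != -EINPROGRESS)
		return ret;

	/* MPA negotiation continues from the state-change upcall. */
	return 0;
}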

This commit also fixes a duplicated include issue reported by Abaci Robot.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
---
 drivers/infiniband/hw/erdma/erdma_cm.c | 1430 ++++++++++++++++++++++++
 drivers/infiniband/hw/erdma/erdma_cm.h |  167 +++
 2 files changed, 1597 insertions(+)
 create mode 100644 drivers/infiniband/hw/erdma/erdma_cm.c
 create mode 100644 drivers/infiniband/hw/erdma/erdma_cm.h

Comments

Weihang Li June 16, 2022, 2:59 a.m. UTC | #1
On 2022/6/15 9:52, Cheng Xu wrote:
> ERDMA's transport protocol is iWarp, so the driver must support the CM
> interface. For the CM part, we take the same approach as SoftiWarp: a
> kernel socket is used to set up the connection, and MPA negotiation is
> then performed in the kernel. This part of the code therefore mainly
> comes from SoftiWarp; based on it, we add some more features, such as a
> non-blocking iw_connect implementation.
> 
> This commit also fixes a duplicated include issue reported by Abaci Robot.
> 
> Reported-by: Abaci Robot <abaci@linux.alibaba.com>
> Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
> Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
> ---
>   drivers/infiniband/hw/erdma/erdma_cm.c | 1430 ++++++++++++++++++++++++
>   drivers/infiniband/hw/erdma/erdma_cm.h |  167 +++
>   2 files changed, 1597 insertions(+)
>   create mode 100644 drivers/infiniband/hw/erdma/erdma_cm.c
>   create mode 100644 drivers/infiniband/hw/erdma/erdma_cm.h
> 
> diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c
> new file mode 100644
> index 000000000000..8b74a173dab7
> --- /dev/null
> +++ b/drivers/infiniband/hw/erdma/erdma_cm.c
> @@ -0,0 +1,1430 @@
> +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
> +
> +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
> +/*          Kai Shen <kaishen@linux.alibaba.com> */
> +/* Copyright (c) 2020-2022, Alibaba Group. */
> +
> +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
> +/*          Fredy Neeser */
> +/*          Greg Joyce <greg@opengridcomputing.com> */
> +/* Copyright (c) 2008-2019, IBM Corporation */
> +/* Copyright (c) 2017, Open Grid Computing, Inc. */
> +
> +#include <linux/errno.h>
> +#include <linux/inetdevice.h>
> +#include <linux/net.h>
> +#include <linux/types.h>
> +#include <linux/workqueue.h>
> +#include <net/addrconf.h>
> +
> +#include <rdma/ib_user_verbs.h>
> +#include <rdma/ib_verbs.h>
> +
> +#include "erdma.h"
> +#include "erdma_cm.h"
> +#include "erdma_verbs.h"
> +
> +static struct workqueue_struct *erdma_cm_wq;
> +
> +static void erdma_cm_llp_state_change(struct sock *sk);
> +static void erdma_cm_llp_data_ready(struct sock *sk);
> +static void erdma_cm_llp_error_report(struct sock *sk);
> +
> +static void erdma_sk_assign_cm_upcalls(struct sock *sk)
> +{
> +	write_lock_bh(&sk->sk_callback_lock);
> +	sk->sk_state_change = erdma_cm_llp_state_change;
> +	sk->sk_data_ready = erdma_cm_llp_data_ready;
> +	sk->sk_error_report = erdma_cm_llp_error_report;
> +	write_unlock_bh(&sk->sk_callback_lock);
> +}
> +
> +static void erdma_sk_save_upcalls(struct sock *sk)
> +{
> +	struct erdma_cep *cep = sk_to_cep(sk);
> +
> +	write_lock_bh(&sk->sk_callback_lock);
> +	cep->sk_state_change = sk->sk_state_change;
> +	cep->sk_data_ready = sk->sk_data_ready;
> +	cep->sk_error_report = sk->sk_error_report;
> +	write_unlock_bh(&sk->sk_callback_lock);
> +}
> +
> +static void erdma_sk_restore_upcalls(struct sock *sk, struct erdma_cep *cep)
> +{
> +	sk->sk_state_change = cep->sk_state_change;
> +	sk->sk_data_ready = cep->sk_data_ready;
> +	sk->sk_error_report = cep->sk_error_report;
> +	sk->sk_user_data = NULL;
> +}
> +
> +static void erdma_socket_disassoc(struct socket *s)
> +{
> +	struct sock *sk = s->sk;
> +	struct erdma_cep *cep;
> +
> +	if (sk) {
> +		write_lock_bh(&sk->sk_callback_lock);
> +		cep = sk_to_cep(sk);
> +		if (cep) {
> +			erdma_sk_restore_upcalls(sk, cep);
> +			erdma_cep_put(cep);
> +		} else {
> +			WARN_ON_ONCE(1);
> +		}
> +		write_unlock_bh(&sk->sk_callback_lock);
> +	} else {
> +		WARN_ON_ONCE(1);
> +	}
> +}
> +
> +static void erdma_cep_socket_assoc(struct erdma_cep *cep, struct socket *s)
> +{
> +	cep->sock = s;
> +	erdma_cep_get(cep);
> +	s->sk->sk_user_data = cep;
> +
> +	erdma_sk_save_upcalls(s->sk);
> +	erdma_sk_assign_cm_upcalls(s->sk);
> +}
> +
> +static void erdma_disassoc_listen_cep(struct erdma_cep *cep)
> +{
> +	if (cep->listen_cep) {
> +		erdma_cep_put(cep->listen_cep);
> +		cep->listen_cep = NULL;
> +	}
> +}
> +
> +static struct erdma_cep *erdma_cep_alloc(struct erdma_dev *dev)
> +{
> +	struct erdma_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
> +	unsigned long flags;
> +
> +	if (!cep)
> +		return NULL;
> +
> +	INIT_LIST_HEAD(&cep->listenq);
> +	INIT_LIST_HEAD(&cep->devq);
> +	INIT_LIST_HEAD(&cep->work_freelist);
> +
> +	kref_init(&cep->ref);
> +	cep->state = ERDMA_EPSTATE_IDLE;
> +	init_waitqueue_head(&cep->waitq);
> +	spin_lock_init(&cep->lock);
> +	cep->dev = dev;
> +
> +	spin_lock_irqsave(&dev->lock, flags);
> +	list_add_tail(&cep->devq, &dev->cep_list);
> +	spin_unlock_irqrestore(&dev->lock, flags);
> +
> +	return cep;
> +}
> +
> +static void erdma_cm_free_work(struct erdma_cep *cep)
> +{
> +	struct list_head *w, *tmp;
> +	struct erdma_cm_work *work;
> +
> +	list_for_each_safe(w, tmp, &cep->work_freelist) {
> +		work = list_entry(w, struct erdma_cm_work, list);
> +		list_del(&work->list);
> +		kfree(work);
> +	}
> +}
> +
> +static void erdma_cancel_mpatimer(struct erdma_cep *cep)
> +{
> +	spin_lock_bh(&cep->lock);
> +	if (cep->mpa_timer) {
> +		if (cancel_delayed_work(&cep->mpa_timer->work)) {
> +			erdma_cep_put(cep);
> +			kfree(cep->mpa_timer);
> +		}
> +		cep->mpa_timer = NULL;
> +	}
> +	spin_unlock_bh(&cep->lock);
> +}
> +
> +static void erdma_put_work(struct erdma_cm_work *work)
> +{
> +	INIT_LIST_HEAD(&work->list);
> +	spin_lock_bh(&work->cep->lock);
> +	list_add(&work->list, &work->cep->work_freelist);
> +	spin_unlock_bh(&work->cep->lock);
> +}
> +
> +static void erdma_cep_set_inuse(struct erdma_cep *cep)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&cep->lock, flags);
> +	while (cep->in_use) {
> +		spin_unlock_irqrestore(&cep->lock, flags);
> +		wait_event_interruptible(cep->waitq, !cep->in_use);
> +		if (signal_pending(current))
> +			flush_signals(current);
> +
> +		spin_lock_irqsave(&cep->lock, flags);
> +	}
> +
> +	cep->in_use = 1;
> +	spin_unlock_irqrestore(&cep->lock, flags);
> +}
> +
> +static void erdma_cep_set_free(struct erdma_cep *cep)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&cep->lock, flags);
> +	cep->in_use = 0;
> +	spin_unlock_irqrestore(&cep->lock, flags);
> +
> +	wake_up(&cep->waitq);
> +}
> +
> +static void __erdma_cep_dealloc(struct kref *ref)
> +{
> +	struct erdma_cep *cep = container_of(ref, struct erdma_cep, ref);
> +	struct erdma_dev *dev = cep->dev;
> +	unsigned long flags;
> +
> +	WARN_ON(cep->listen_cep);
> +
> +	kfree(cep->private_data);
> +	kfree(cep->mpa.pdata);
> +	spin_lock_bh(&cep->lock);
> +	if (!list_empty(&cep->work_freelist))
> +		erdma_cm_free_work(cep);
> +	spin_unlock_bh(&cep->lock);
> +
> +	spin_lock_irqsave(&dev->lock, flags);
> +	list_del(&cep->devq);
> +	spin_unlock_irqrestore(&dev->lock, flags);
> +	kfree(cep);
> +}
> +
> +static struct erdma_cm_work *erdma_get_work(struct erdma_cep *cep)
> +{
> +	struct erdma_cm_work *work = NULL;
> +
> +	spin_lock_bh(&cep->lock);
> +	if (!list_empty(&cep->work_freelist)) {
> +		work = list_entry(cep->work_freelist.next, struct erdma_cm_work,
> +				  list);
> +		list_del_init(&work->list);
> +	}
> +
> +	spin_unlock_bh(&cep->lock);
> +	return work;
> +}
> +
> +static int erdma_cm_alloc_work(struct erdma_cep *cep, int num)
> +{
> +	struct erdma_cm_work *work;
> +
> +	while (num--) {
> +		work = kmalloc(sizeof(*work), GFP_KERNEL);
> +		if (!work) {
> +			if (!(list_empty(&cep->work_freelist)))
> +				erdma_cm_free_work(cep);
> +			return -ENOMEM;
> +		}
> +		work->cep = cep;
> +		INIT_LIST_HEAD(&work->list);
> +		list_add(&work->list, &cep->work_freelist);
> +	}
> +
> +	return 0;
> +}
> +
> +static int erdma_cm_upcall(struct erdma_cep *cep, enum iw_cm_event_type reason,
> +			   int status)
> +{
> +	struct iw_cm_event event;
> +	struct iw_cm_id *cm_id;
> +
> +	memset(&event, 0, sizeof(event));
> +	event.status = status;
> +	event.event = reason;
> +
> +	if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
> +		event.provider_data = cep;
> +		cm_id = cep->listen_cep->cm_id;
> +
> +		event.ird = cep->dev->attrs.max_ird;
> +		event.ord = cep->dev->attrs.max_ord;
> +	} else {
> +		cm_id = cep->cm_id;
> +	}
> +
> +	if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
> +	    reason == IW_CM_EVENT_CONNECT_REPLY) {
> +		u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
> +
> +		if (pd_len && cep->mpa.pdata) {
> +			event.private_data_len = pd_len;
> +			event.private_data = cep->mpa.pdata;
> +		}
> +
> +		getname_local(cep->sock, &event.local_addr);
> +		getname_peer(cep->sock, &event.remote_addr);
> +	}
> +
> +	return cm_id->event_handler(cm_id, &event);
> +}
> +
> +void erdma_qp_cm_drop(struct erdma_qp *qp)
> +{
> +	struct erdma_cep *cep = qp->cep;
> +
> +	if (!qp->cep)
> +		return;
> +
> +	erdma_cep_set_inuse(cep);
> +
> +	/* already closed. */
> +	if (cep->state == ERDMA_EPSTATE_CLOSED)
> +		goto out;
> +
> +	if (cep->cm_id) {
> +		switch (cep->state) {
> +		case ERDMA_EPSTATE_AWAIT_MPAREP:
> +			erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
> +					-EINVAL);
> +			break;
> +
> +		case ERDMA_EPSTATE_RDMA_MODE:
> +			erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
> +			break;
> +
> +		case ERDMA_EPSTATE_IDLE:
> +		case ERDMA_EPSTATE_LISTENING:
> +		case ERDMA_EPSTATE_CONNECTING:
> +		case ERDMA_EPSTATE_AWAIT_MPAREQ:
> +		case ERDMA_EPSTATE_RECVD_MPAREQ:
> +		case ERDMA_EPSTATE_CLOSED:
> +		default:
> +			break;
> +		}
> +		cep->cm_id->rem_ref(cep->cm_id);
> +		cep->cm_id = NULL;
> +		erdma_cep_put(cep);
> +	}
> +	cep->state = ERDMA_EPSTATE_CLOSED;
> +
> +	if (cep->sock) {
> +		erdma_socket_disassoc(cep->sock);
> +		sock_release(cep->sock);
> +		cep->sock = NULL;
> +	}
> +
> +	if (cep->qp) {
> +		cep->qp = NULL;
> +		erdma_qp_put(qp);
> +	}
> +out:
> +	erdma_cep_set_free(cep);
> +}
> +
> +void erdma_cep_put(struct erdma_cep *cep)
> +{
> +	WARN_ON(kref_read(&cep->ref) < 1);
> +	kref_put(&cep->ref, __erdma_cep_dealloc);
> +}
> +
> +void erdma_cep_get(struct erdma_cep *cep)
> +{
> +	kref_get(&cep->ref);
> +}
> +
> +static int erdma_send_mpareqrep(struct erdma_cep *cep, const void *pdata,
> +				u8 pd_len)
> +{
> +	struct socket *s = cep->sock;
> +	struct mpa_rr *rr = &cep->mpa.hdr;
> +	struct kvec iov[3];
> +	struct msghdr msg;
> +	int iovec_num = 0;
> +	int ret;
> +	int mpa_len;
> +
> +	memset(&msg, 0, sizeof(msg));
> +
> +	rr->params.pd_len = cpu_to_be16(pd_len);
> +
> +	iov[iovec_num].iov_base = rr;
> +	iov[iovec_num].iov_len = sizeof(*rr);
> +	iovec_num++;
> +	mpa_len = sizeof(*rr);
> +
> +	iov[iovec_num].iov_base = &cep->mpa.ext_data;
> +	iov[iovec_num].iov_len = sizeof(cep->mpa.ext_data);
> +	iovec_num++;
> +	mpa_len += sizeof(cep->mpa.ext_data);
> +
> +	if (pd_len) {
> +		iov[iovec_num].iov_base = (char *)pdata;
> +		iov[iovec_num].iov_len = pd_len;
> +		mpa_len += pd_len;
> +		iovec_num++;
> +	}
> +
> +	ret = kernel_sendmsg(s, &msg, iov, iovec_num, mpa_len);
> +
> +	return ret < 0 ? ret : 0;
> +}
> +
> +static inline int ksock_recv(struct socket *sock, char *buf, size_t size,
> +			     int flags)
> +{
> +	struct kvec iov = { buf, size };
> +	struct msghdr msg = { .msg_name = NULL, .msg_flags = flags };
> +
> +	return kernel_recvmsg(sock, &msg, &iov, 1, size, flags);
> +}
> +
> +static int __recv_mpa_hdr(struct erdma_cep *cep, int hdr_rcvd, char *hdr,
> +			  int hdr_size, int *rcvd_out)
> +{
> +	struct socket *s = cep->sock;
> +	int rcvd;
> +
> +	*rcvd_out = 0;
> +	if (hdr_rcvd < hdr_size) {
> +		rcvd = ksock_recv(s, hdr + hdr_rcvd, hdr_size - hdr_rcvd,
> +				  MSG_DONTWAIT);
> +		if (rcvd == -EAGAIN)
> +			return -EAGAIN;
> +
> +		if (rcvd <= 0)
> +			return -ECONNABORTED;
> +
> +		hdr_rcvd += rcvd;
> +		*rcvd_out = rcvd;
> +
> +		if (hdr_rcvd < hdr_size)
> +			return -EAGAIN;
> +	}
> +
> +	return 0;
> +}
> +
> +static void __mpa_rr_set_revision(__be16 *bits, u8 rev)
> +{
> +	*bits = (*bits & ~MPA_RR_MASK_REVISION) |
> +		(cpu_to_be16(rev) & MPA_RR_MASK_REVISION);
> +}
> +
> +static u8 __mpa_rr_revision(__be16 mpa_rr_bits)
> +{
> +	__be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION;
> +
> +	return (u8)be16_to_cpu(rev);
> +}
> +
> +static void __mpa_ext_set_cc(__be32 *bits, u32 cc)
> +{
> +	*bits = (*bits & ~MPA_EXT_FLAG_CC) |
> +		(cpu_to_be32(cc) & MPA_EXT_FLAG_CC);
> +}
> +
> +static u8 __mpa_ext_cc(__be32 mpa_ext_bits)
> +{
> +	__be32 cc = mpa_ext_bits & MPA_EXT_FLAG_CC;
> +
> +	return (u8)be32_to_cpu(cc);
> +}
> +
> +/*
> + * Receive MPA Request/Reply header.
> + *
> + * Returns 0 if complete MPA Request/Reply header including
> + * eventual private data was received. Returns -EAGAIN if
> + * header was partially received or negative error code otherwise.
> + *
> + * Context: May be called in process context only
> + */
> +static int erdma_recv_mpa_rr(struct erdma_cep *cep)
> +{
> +	struct mpa_rr *hdr = &cep->mpa.hdr;
> +	struct socket *s = cep->sock;
> +	u16 pd_len;
> +	int rcvd, to_rcv, ret, pd_rcvd;
> +
> +	if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
> +		ret = __recv_mpa_hdr(cep, cep->mpa.bytes_rcvd,
> +				     (char *)&cep->mpa.hdr,
> +				     sizeof(struct mpa_rr), &rcvd);
> +		cep->mpa.bytes_rcvd += rcvd;
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA ||
> +	    __mpa_rr_revision(hdr->params.bits) != MPA_REVISION_EXT_1)
> +		return -EPROTO;
> +
> +	if (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) <
> +	    sizeof(struct erdma_mpa_ext)) {
> +		ret = __recv_mpa_hdr(
> +			cep, cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
> +			(char *)&cep->mpa.ext_data,
> +			sizeof(struct erdma_mpa_ext), &rcvd);
> +		cep->mpa.bytes_rcvd += rcvd;
> +		if (ret)
> +			return ret;
> +	}
> +
> +	pd_len = be16_to_cpu(hdr->params.pd_len);
> +	pd_rcvd = cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) -
> +		  sizeof(struct erdma_mpa_ext);
> +	to_rcv = pd_len - pd_rcvd;
> +
> +	if (!to_rcv) {
> +		/*
> +		 * We have received the whole MPA Request/Reply message.
> +		 * Check against peer protocol violation.
> +		 */
> +		u32 word;
> +
> +		ret = __recv_mpa_hdr(cep, 0, (char *)&word, sizeof(word),
> +				     &rcvd);
> +		if (ret == -EAGAIN && rcvd == 0)
> +			return 0;
> +
> +		if (ret)
> +			return ret;
> +
> +		return -EPROTO;
> +	}
> +
> +	/*
> +	 * At this point, MPA header has been fully received, and pd_len != 0.
> +	 * So, begin to receive private data.
> +	 */
> +	if (!cep->mpa.pdata) {
> +		cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
> +		if (!cep->mpa.pdata)
> +			return -ENOMEM;
> +	}
> +
> +	rcvd = ksock_recv(s, cep->mpa.pdata + pd_rcvd, to_rcv + 4,
> +			  MSG_DONTWAIT);
> +	if (rcvd < 0)
> +		return rcvd;
> +
> +	if (rcvd > to_rcv)
> +		return -EPROTO;
> +
> +	cep->mpa.bytes_rcvd += rcvd;
> +
> +	if (to_rcv == rcvd)
> +		return 0;
> +
> +	return -EAGAIN;
> +}
> +
> +/*
> + * erdma_proc_mpareq()
> + *
> + * Read MPA Request from socket and signal new connection to IWCM
> + * if success. Caller must hold lock on corresponding listening CEP.
> + */
> +static int erdma_proc_mpareq(struct erdma_cep *cep)
> +{
> +	struct mpa_rr *req;
> +	int ret;
> +
> +	ret = erdma_recv_mpa_rr(cep);
> +	if (ret)
> +		return ret;
> +
> +	req = &cep->mpa.hdr;
> +
> +	if (memcmp(req->key, MPA_KEY_REQ, MPA_KEY_SIZE))
> +		return -EPROTO;
> +
> +	memcpy(req->key, MPA_KEY_REP, MPA_KEY_SIZE);
> +
> +	/* Currently does not support marker and crc. */
> +	if (req->params.bits & MPA_RR_FLAG_MARKERS ||
> +	    req->params.bits & MPA_RR_FLAG_CRC)
> +		goto reject_conn;
> +
> +	cep->state = ERDMA_EPSTATE_RECVD_MPAREQ;
> +
> +	/* Keep reference until IWCM accepts/rejects */
> +	erdma_cep_get(cep);
> +	ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
> +	if (ret)
> +		erdma_cep_put(cep);
> +
> +	return ret;
> +
> +reject_conn:
> +	req->params.bits &= ~MPA_RR_FLAG_MARKERS;
> +	req->params.bits |= MPA_RR_FLAG_REJECT;
> +	req->params.bits &= ~MPA_RR_FLAG_CRC;
> +
> +	kfree(cep->mpa.pdata);
> +	cep->mpa.pdata = NULL;
> +	erdma_send_mpareqrep(cep, NULL, 0);
> +
> +	return -EOPNOTSUPP;
> +}
> +
> +static int erdma_proc_mpareply(struct erdma_cep *cep)
> +{
> +	struct erdma_qp_attrs qp_attrs;
> +	struct erdma_qp *qp = cep->qp;
> +	struct mpa_rr *rep;
> +	int ret;
> +
> +	ret = erdma_recv_mpa_rr(cep);
> +	if (ret != -EAGAIN)
> +		erdma_cancel_mpatimer(cep);
> +	if (ret)
> +		goto out_err;
> +
> +	rep = &cep->mpa.hdr;
> +
> +	if (memcmp(rep->key, MPA_KEY_REP, MPA_KEY_SIZE)) {
> +		ret = -EPROTO;
> +		goto out_err;
> +	}
> +
> +	if (rep->params.bits & MPA_RR_FLAG_REJECT) {
> +		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
> +		return -ECONNRESET;
> +	}
> +
> +	/* Currently does not support marker and crc. */
> +	if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
> +	    (rep->params.bits & MPA_RR_FLAG_CRC)) {
> +		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
> +		return -EINVAL;
> +	}
> +
> +	memset(&qp_attrs, 0, sizeof(qp_attrs));
> +	qp_attrs.irq_size = cep->ird;
> +	qp_attrs.orq_size = cep->ord;
> +	qp_attrs.state = ERDMA_QP_STATE_RTS;
> +
> +	down_write(&qp->state_lock);
> +	if (qp->attrs.state > ERDMA_QP_STATE_RTR) {
> +		ret = -EINVAL;
> +		up_write(&qp->state_lock);
> +		goto out_err;
> +	}
> +
> +	qp->attrs.qp_type = ERDMA_QP_ACTIVE;
> +	if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc)
> +		qp->attrs.cc = COMPROMISE_CC;
> +
> +	ret = erdma_modify_qp_internal(qp, &qp_attrs,
> +				       ERDMA_QP_ATTR_STATE |
> +				       ERDMA_QP_ATTR_LLP_HANDLE |
> +				       ERDMA_QP_ATTR_MPA);
> +
> +	up_write(&qp->state_lock);
> +
> +	if (!ret) {
> +		ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
> +		if (!ret)
> +			cep->state = ERDMA_EPSTATE_RDMA_MODE;
> +
> +		return 0;
> +	}
> +
> +out_err:
> +	erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
> +	return ret;
> +}
> +
> +static void erdma_accept_newconn(struct erdma_cep *cep)
> +{
> +	struct socket *s = cep->sock;
> +	struct socket *new_s = NULL;
> +	struct erdma_cep *new_cep = NULL;
> +	int ret = 0;
> +
> +	if (cep->state != ERDMA_EPSTATE_LISTENING)
> +		goto error;
> +
> +	new_cep = erdma_cep_alloc(cep->dev);
> +	if (!new_cep)
> +		goto error;
> +
> +	/*
> +	 * 4: Allocate a sufficient number of work elements
> +	 * to allow concurrent handling of local + peer close
> +	 * events, MPA header processing + MPA timeout.
> +	 */
> +	if (erdma_cm_alloc_work(new_cep, 4) != 0)
> +		goto error;
> +
> +	/*
> +	 * Copy saved socket callbacks from listening CEP
> +	 * and assign new socket with new CEP
> +	 */
> +	new_cep->sk_state_change = cep->sk_state_change;
> +	new_cep->sk_data_ready = cep->sk_data_ready;
> +	new_cep->sk_error_report = cep->sk_error_report;
> +
> +	ret = kernel_accept(s, &new_s, O_NONBLOCK);
> +	if (ret != 0)
> +		goto error;
> +
> +	new_cep->sock = new_s;
> +	erdma_cep_get(new_cep);
> +	new_s->sk->sk_user_data = new_cep;
> +
> +	tcp_sock_set_nodelay(new_s->sk);
> +	new_cep->state = ERDMA_EPSTATE_AWAIT_MPAREQ;
> +
> +	ret = erdma_cm_queue_work(new_cep, ERDMA_CM_WORK_MPATIMEOUT);
> +	if (ret)
> +		goto error;
> +
> +	new_cep->listen_cep = cep;
> +	erdma_cep_get(cep);
> +
> +	if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
> +		/* MPA REQ already queued */
> +		erdma_cep_set_inuse(new_cep);
> +		ret = erdma_proc_mpareq(new_cep);
> +		if (ret != -EAGAIN) {
> +			erdma_cep_put(cep);
> +			new_cep->listen_cep = NULL;
> +			if (ret) {
> +				erdma_cep_set_free(new_cep);
> +				goto error;
> +			}
> +		}
> +		erdma_cep_set_free(new_cep);
> +	}
> +	return;
> +
> +error:
> +	if (new_cep) {
> +		new_cep->state = ERDMA_EPSTATE_CLOSED;
> +		erdma_cancel_mpatimer(new_cep);
> +
> +		erdma_cep_put(new_cep);
> +		new_cep->sock = NULL;
> +	}
> +
> +	if (new_s) {
> +		erdma_socket_disassoc(new_s);
> +		sock_release(new_s);
> +	}
> +}
> +
> +static int erdma_newconn_connected(struct erdma_cep *cep)
> +{
> +	int ret = 0;
> +
> +	cep->mpa.hdr.params.bits = 0;
> +	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1);
> +
> +	memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE);
> +	cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie);
> +	__mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc);
> +
> +	ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len);
> +	cep->state = ERDMA_EPSTATE_AWAIT_MPAREP;
> +	cep->mpa.hdr.params.pd_len = 0;
> +
> +	if (ret >= 0)
> +		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_MPATIMEOUT);
> +
> +	return ret;
> +}
> +
> +static void erdma_cm_work_handler(struct work_struct *w)
> +{
> +	struct erdma_cm_work *work;
> +	struct erdma_cep *cep;
> +	int release_cep = 0, ret = 0;
> +
> +	work = container_of(w, struct erdma_cm_work, work.work);
> +	cep = work->cep;
> +
> +	erdma_cep_set_inuse(cep);
> +
> +	switch (work->type) {
> +	case ERDMA_CM_WORK_CONNECTED:
> +		erdma_cancel_mpatimer(cep);
> +		if (cep->state == ERDMA_EPSTATE_CONNECTING) {
> +			ret = erdma_newconn_connected(cep);
> +			if (ret) {
> +				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
> +						-EIO);
> +				release_cep = 1;
> +			}
> +		}
> +		break;
> +	case ERDMA_CM_WORK_CONNECTTIMEOUT:
> +		if (cep->state == ERDMA_EPSTATE_CONNECTING) {
> +			cep->mpa_timer = NULL;
> +			erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
> +					-ETIMEDOUT);
> +			release_cep = 1;
> +		}
> +		break;
> +	case ERDMA_CM_WORK_ACCEPT:
> +		erdma_accept_newconn(cep);
> +		break;
> +	case ERDMA_CM_WORK_READ_MPAHDR:
> +		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
> +			if (cep->listen_cep) {
> +				erdma_cep_set_inuse(cep->listen_cep);
> +
> +				if (cep->listen_cep->state ==
> +				    ERDMA_EPSTATE_LISTENING)
> +					ret = erdma_proc_mpareq(cep);
> +				else
> +					ret = -EFAULT;
> +
> +				erdma_cep_set_free(cep->listen_cep);
> +
> +				if (ret != -EAGAIN) {
> +					erdma_cep_put(cep->listen_cep);
> +					cep->listen_cep = NULL;
> +					if (ret)
> +						erdma_cep_put(cep);
> +				}
> +			}
> +		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
> +			ret = erdma_proc_mpareply(cep);
> +		}
> +
> +		if (ret && ret != -EAGAIN)
> +			release_cep = 1;
> +		break;
> +	case ERDMA_CM_WORK_CLOSE_LLP:
> +		if (cep->cm_id)
> +			erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
> +		release_cep = 1;
> +		break;
> +	case ERDMA_CM_WORK_PEER_CLOSE:
> +		if (cep->cm_id) {
> +			if (cep->state == ERDMA_EPSTATE_CONNECTING ||
> +			    cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
> +				/*
> +				 * MPA reply not received, but connection drop
> +				 */
> +				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
> +						-ECONNRESET);
> +			} else if (cep->state == ERDMA_EPSTATE_RDMA_MODE) {
> +				/*
> +				 * NOTE: IW_CM_EVENT_DISCONNECT is given just
> +				 *       to transition IWCM into CLOSING.
> +				 */
> +				erdma_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
> +				erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
> +			}
> +		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
> +			/* Socket close before MPA request received. */
> +			erdma_disassoc_listen_cep(cep);
> +			erdma_cep_put(cep);
> +		}
> +		release_cep = 1;
> +		break;
> +	case ERDMA_CM_WORK_MPATIMEOUT:
> +		cep->mpa_timer = NULL;
> +		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
> +			/*
> +			 * MPA request timed out:
> +			 * Hide any partially received private data and signal
> +			 * timeout
> +			 */
> +			cep->mpa.hdr.params.pd_len = 0;
> +
> +			if (cep->cm_id)
> +				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
> +						-ETIMEDOUT);
> +			release_cep = 1;
> +		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
> +			/* No MPA req received after peer TCP stream setup. */
> +			erdma_disassoc_listen_cep(cep);
> +
> +			erdma_cep_put(cep);
> +			release_cep = 1;
> +		}
> +		break;
> +	default:
> +		WARN(1, "Undefined CM work type: %d\n", work->type);
> +	}
> +
> +	if (release_cep) {
> +		erdma_cancel_mpatimer(cep);
> +		cep->state = ERDMA_EPSTATE_CLOSED;
> +		if (cep->qp) {
> +			struct erdma_qp *qp = cep->qp;
> +			/*
> +			 * Serialize a potential race with application
> +			 * closing the QP and calling erdma_qp_cm_drop()
> +			 */
> +			erdma_qp_get(qp);
> +			erdma_cep_set_free(cep);
> +
> +			erdma_qp_llp_close(qp);
> +			erdma_qp_put(qp);
> +
> +			erdma_cep_set_inuse(cep);
> +			cep->qp = NULL;
> +			erdma_qp_put(qp);
> +		}
> +
> +		if (cep->sock) {
> +			erdma_socket_disassoc(cep->sock);
> +			sock_release(cep->sock);
> +			cep->sock = NULL;
> +		}
> +
> +		if (cep->cm_id) {
> +			cep->cm_id->rem_ref(cep->cm_id);
> +			cep->cm_id = NULL;
> +			if (cep->state != ERDMA_EPSTATE_LISTENING)
> +				erdma_cep_put(cep);
> +		}
> +	}
> +	erdma_cep_set_free(cep);
> +	erdma_put_work(work);
> +	erdma_cep_put(cep);
> +}
> +
> +int erdma_cm_queue_work(struct erdma_cep *cep, enum erdma_work_type type)
> +{
> +	struct erdma_cm_work *work = erdma_get_work(cep);
> +	unsigned long delay = 0;
> +
> +	if (!work)
> +		return -ENOMEM;
> +
> +	work->type = type;
> +	work->cep = cep;
> +
> +	erdma_cep_get(cep);
> +
> +	INIT_DELAYED_WORK(&work->work, erdma_cm_work_handler);
> +
> +	if (type == ERDMA_CM_WORK_MPATIMEOUT) {
> +		cep->mpa_timer = work;
> +
> +		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP)
> +			delay = MPAREP_TIMEOUT;
> +		else
> +			delay = MPAREQ_TIMEOUT;
> +	} else if (type == ERDMA_CM_WORK_CONNECTTIMEOUT) {
> +		cep->mpa_timer = work;
> +
> +		delay = CONNECT_TIMEOUT;
> +	}
> +
> +	queue_delayed_work(erdma_cm_wq, &work->work, delay);
> +
> +	return 0;
> +}
> +
> +static void erdma_cm_llp_data_ready(struct sock *sk)
> +{
> +	struct erdma_cep *cep;
> +
> +	read_lock(&sk->sk_callback_lock);
> +
> +	cep = sk_to_cep(sk);
> +	if (!cep)
> +		goto out;
> +
> +	if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ ||
> +	    cep->state == ERDMA_EPSTATE_AWAIT_MPAREP)
> +		erdma_cm_queue_work(cep, ERDMA_CM_WORK_READ_MPAHDR);
> +
> +out:
> +	read_unlock(&sk->sk_callback_lock);
> +}
> +
> +static void erdma_cm_llp_error_report(struct sock *sk)
> +{
> +	struct erdma_cep *cep = sk_to_cep(sk);
> +
> +	if (cep)
> +		cep->sk_error_report(sk);
> +}
> +
> +static void erdma_cm_llp_state_change(struct sock *sk)
> +{
> +	struct erdma_cep *cep;
> +	void (*orig_state_change)(struct sock *sk);
> +
> +	read_lock(&sk->sk_callback_lock);
> +
> +	cep = sk_to_cep(sk);
> +	if (!cep) {
> +		read_unlock(&sk->sk_callback_lock);
> +		return;
> +	}
> +	orig_state_change = cep->sk_state_change;
> +
> +	switch (sk->sk_state) {
> +	case TCP_ESTABLISHED:
> +		if (cep->state == ERDMA_EPSTATE_CONNECTING)
> +			erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED);
> +		else
> +			erdma_cm_queue_work(cep, ERDMA_CM_WORK_ACCEPT);
> +		break;
> +	case TCP_CLOSE:
> +	case TCP_CLOSE_WAIT:
> +		if (cep->state != ERDMA_EPSTATE_LISTENING)
> +			erdma_cm_queue_work(cep, ERDMA_CM_WORK_PEER_CLOSE);
> +		break;
> +	default:
> +		break;
> +	}
> +	read_unlock(&sk->sk_callback_lock);
> +	orig_state_change(sk);
> +}
> +
> +static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
> +			      int laddrlen, struct sockaddr *raddr,
> +			      int raddrlen, int flags)
> +{
> +	int ret;
> +
> +	sock_set_reuseaddr(s->sk);
> +	ret = s->ops->bind(s, laddr, laddrlen);
> +	if (ret)
> +		return ret;
> +	ret = s->ops->connect(s, raddr, raddrlen, flags);
> +	return ret < 0 ? ret : 0;
> +}
> +
> +int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
> +{
> +	struct erdma_dev *dev = to_edev(id->device);
> +	struct erdma_qp *qp;
> +	struct erdma_cep *cep = NULL;
> +	struct socket *s = NULL;
> +	struct sockaddr *laddr = (struct sockaddr *)&id->m_local_addr;
> +	struct sockaddr *raddr = (struct sockaddr *)&id->m_remote_addr;
> +	u16 pd_len = params->private_data_len;
> +	int ret;
> +
> +	if (pd_len > MPA_MAX_PRIVDATA)
> +		return -EINVAL;
> +
> +	if (params->ird > dev->attrs.max_ird ||
> +	    params->ord > dev->attrs.max_ord)
> +		return -EINVAL;
> +
> +	if (laddr->sa_family != AF_INET || raddr->sa_family != AF_INET)
> +		return -EAFNOSUPPORT;
> +
> +	qp = find_qp_by_qpn(dev, params->qpn);
> +	if (!qp)
> +		return -ENOENT;
> +	erdma_qp_get(qp);
> +
> +	ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &s);
> +	if (ret < 0)
> +		goto error_put_qp;
> +
> +	cep = erdma_cep_alloc(dev);
> +	if (!cep) {
> +		ret = -ENOMEM;
> +		goto error_release_sock;
> +	}
> +
> +	erdma_cep_set_inuse(cep);
> +
> +	/* Associate QP with CEP */
> +	erdma_cep_get(cep);
> +	qp->cep = cep;
> +	cep->qp = qp;
> +
> +	/* Associate cm_id with CEP */
> +	id->add_ref(id);
> +	cep->cm_id = id;
> +
> +	/*
> +	 * 6: Allocate a sufficient number of work elements
> +	 * to allow concurrent handling of local + peer close
> +	 * events, MPA header processing + MPA timeout, connected event
> +	 * and connect timeout.
> +	 */
> +	ret = erdma_cm_alloc_work(cep, 6);
> +	if (ret != 0) {
> +		ret = -ENOMEM;
> +		goto error_release_cep;
> +	}
> +
> +	cep->ird = params->ird;
> +	cep->ord = params->ord;
> +	cep->state = ERDMA_EPSTATE_CONNECTING;
> +
> +	erdma_cep_socket_assoc(cep, s);
> +
> +	if (pd_len) {
> +		cep->pd_len = pd_len;
> +		cep->private_data = kmalloc(pd_len, GFP_KERNEL);
> +		if (!cep->private_data) {
> +			ret = -ENOMEM;
> +			goto error_disassoc;
> +		}
> +
> +		memcpy(cep->private_data, params->private_data,
> +		       params->private_data_len);
> +	}
> +
> +	ret = kernel_bindconnect(s, laddr, sizeof(*laddr), raddr,
> +				 sizeof(*raddr), O_NONBLOCK);
> +	if (ret != -EINPROGRESS && ret != 0) {
> +		goto error_disassoc;
> +	} else if (ret == 0) {
> +		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED);
> +		if (ret)
> +			goto error_disassoc;
> +	} else {
> +		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTTIMEOUT);
> +		if (ret)
> +			goto error_disassoc;
> +	}
> +
> +	erdma_cep_set_free(cep);
> +	return 0;
> +
> +error_disassoc:
> +	kfree(cep->private_data);
> +	cep->private_data = NULL;
> +	cep->pd_len = 0;
> +
> +	erdma_socket_disassoc(s);
> +
> +error_release_cep:
> +	/* disassoc with cm_id */
> +	cep->cm_id = NULL;
> +	id->rem_ref(id);
> +
> +	/* disassoc with qp */
> +	qp->cep = NULL;
> +	erdma_cep_put(cep);
> +	cep->qp = NULL;
> +
> +	cep->state = ERDMA_EPSTATE_CLOSED;
> +
> +	erdma_cep_set_free(cep);
> +
> +	/* release the cep. */
> +	erdma_cep_put(cep);
> +
> +error_release_sock:
> +	if (s)
> +		sock_release(s);
> +error_put_qp:
> +	erdma_qp_put(qp);
> +
> +	return ret;
> +}
> +
> +int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
> +{
> +	struct erdma_dev *dev = to_edev(id->device);
> +	struct erdma_cep *cep = (struct erdma_cep *)id->provider_data;
> +	struct erdma_qp *qp;
> +	struct erdma_qp_attrs qp_attrs;
> +	int ret;
> +
> +	erdma_cep_set_inuse(cep);
> +	erdma_cep_put(cep);
> +
> +	/* Free lingering inbound private data */
> +	if (cep->mpa.hdr.params.pd_len) {
> +		cep->mpa.hdr.params.pd_len = 0;
> +		kfree(cep->mpa.pdata);
> +		cep->mpa.pdata = NULL;
> +	}
> +	erdma_cancel_mpatimer(cep);
> +
> +	if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) {
> +		erdma_cep_set_free(cep);
> +		erdma_cep_put(cep);
> +
> +		return -ECONNRESET;
> +	}
> +
> +	qp = find_qp_by_qpn(dev, params->qpn);
> +	if (!qp)
> +		return -ENOENT;
> +	erdma_qp_get(qp);
> +
> +	down_write(&qp->state_lock);
> +	if (qp->attrs.state > ERDMA_QP_STATE_RTR) {
> +		ret = -EINVAL;
> +		up_write(&qp->state_lock);
> +		goto error;
> +	}
> +
> +	if (params->ord > dev->attrs.max_ord ||
> +	    params->ird > dev->attrs.max_ord) {
> +		ret = -EINVAL;
> +		up_write(&qp->state_lock);
> +		goto error;
> +	}
> +
> +	if (params->private_data_len > MPA_MAX_PRIVDATA) {
> +		ret = -EINVAL;
> +		up_write(&qp->state_lock);
> +		goto error;
> +	}
> +
> +	cep->ird = params->ird;
> +	cep->ord = params->ord;
> +
> +	cep->cm_id = id;
> +	id->add_ref(id);
> +
> +	memset(&qp_attrs, 0, sizeof(qp_attrs));
> +	qp_attrs.orq_size = params->ord;
> +	qp_attrs.irq_size = params->ird;
> +
> +	qp_attrs.state = ERDMA_QP_STATE_RTS;
> +
> +	/* Associate QP with CEP */
> +	erdma_cep_get(cep);
> +	qp->cep = cep;
> +	cep->qp = qp;
> +
> +	cep->state = ERDMA_EPSTATE_RDMA_MODE;
> +
> +	qp->attrs.qp_type = ERDMA_QP_PASSIVE;
> +	qp->attrs.pd_len = params->private_data_len;
> +
> +	if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits))
> +		qp->attrs.cc = COMPROMISE_CC;
> +
> +	/* move to rts */
> +	ret = erdma_modify_qp_internal(qp, &qp_attrs,
> +				       ERDMA_QP_ATTR_STATE |
> +				       ERDMA_QP_ATTR_ORD |
> +				       ERDMA_QP_ATTR_LLP_HANDLE |
> +				       ERDMA_QP_ATTR_IRD |
> +				       ERDMA_QP_ATTR_MPA);
> +	up_write(&qp->state_lock);
> +
> +	if (ret)
> +		goto error;
> +
> +	cep->mpa.ext_data.bits = 0;
> +	__mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc);
> +	cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie);
> +
> +	ret = erdma_send_mpareqrep(cep, params->private_data,
> +				   params->private_data_len);

Hi, Cheng Xu

After modifying the QP to RTS, does that mean this TCP stream is offloaded
to the hardware, and the inbound TCP segments related to the TCP stream
will not be received by the kernel stack where this driver is running?
If yes, the ACK of the MPA reply will never be received by the stack?

Thanks
Weihang

> +	if (!ret) {
> +		ret = erdma_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
> +		if (ret)
> +			goto error;
> +
> +		erdma_cep_set_free(cep);
> +
> +		return 0;
> +	}
> +
> +error:
> +	erdma_socket_disassoc(cep->sock);
> +	sock_release(cep->sock);
> +	cep->sock = NULL;
> +
> +	cep->state = ERDMA_EPSTATE_CLOSED;
> +
> +	if (cep->cm_id) {
> +		cep->cm_id->rem_ref(id);
> +		cep->cm_id = NULL;
> +	}
> +
> +	if (qp->cep) {
> +		erdma_cep_put(cep);
> +		qp->cep = NULL;
> +	}
> +
> +	cep->qp = NULL;
> +	erdma_qp_put(qp);
> +
> +	erdma_cep_set_free(cep);
> +	erdma_cep_put(cep);
> +
> +	return ret;
> +}
> +
> +int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen)
> +{
> +	struct erdma_cep *cep = (struct erdma_cep *)id->provider_data;
> +
> +	erdma_cep_set_inuse(cep);
> +	erdma_cep_put(cep);
> +
> +	erdma_cancel_mpatimer(cep);
> +
> +	if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) {
> +		erdma_cep_set_free(cep);
> +		erdma_cep_put(cep);
> +
> +		return -ECONNRESET;
> +	}
> +
> +	if (__mpa_rr_revision(cep->mpa.hdr.params.bits) == MPA_REVISION_EXT_1) {
> +		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
> +		erdma_send_mpareqrep(cep, pdata, plen);
> +	}
> +
> +	erdma_socket_disassoc(cep->sock);
> +	sock_release(cep->sock);
> +	cep->sock = NULL;
> +
> +	cep->state = ERDMA_EPSTATE_CLOSED;
> +
> +	erdma_cep_set_free(cep);
> +	erdma_cep_put(cep);
> +
> +	return 0;
> +}
> +
> +int erdma_create_listen(struct iw_cm_id *id, int backlog)
> +{
> +	struct socket *s;
> +	struct erdma_cep *cep = NULL;
> +	int ret = 0;
> +	struct erdma_dev *dev = to_edev(id->device);
> +	int addr_family = id->local_addr.ss_family;
> +	struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
> +
> +	if (addr_family != AF_INET)
> +		return -EAFNOSUPPORT;
> +
> +	ret = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
> +	if (ret < 0)
> +		return ret;
> +
> +	sock_set_reuseaddr(s->sk);
> +
> +	/* For wildcard addr, limit binding to current device only */
> +	if (ipv4_is_zeronet(laddr->sin_addr.s_addr))
> +		s->sk->sk_bound_dev_if = dev->netdev->ifindex;
> +
> +	ret = s->ops->bind(s, (struct sockaddr *)laddr,
> +			   sizeof(struct sockaddr_in));
> +	if (ret)
> +		goto error;
> +
> +	cep = erdma_cep_alloc(dev);
> +	if (!cep) {
> +		ret = -ENOMEM;
> +		goto error;
> +	}
> +	erdma_cep_socket_assoc(cep, s);
> +
> +	ret = erdma_cm_alloc_work(cep, backlog);
> +	if (ret)
> +		goto error;
> +
> +	ret = s->ops->listen(s, backlog);
> +	if (ret)
> +		goto error;
> +
> +	cep->cm_id = id;
> +	id->add_ref(id);
> +
> +	if (!id->provider_data) {
> +		id->provider_data =
> +			kmalloc(sizeof(struct list_head), GFP_KERNEL);
> +		if (!id->provider_data) {
> +			ret = -ENOMEM;
> +			goto error;
> +		}
> +		INIT_LIST_HEAD((struct list_head *)id->provider_data);
> +	}
> +
> +	list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
> +	cep->state = ERDMA_EPSTATE_LISTENING;
> +
> +	return 0;
> +
> +error:
> +	if (cep) {
> +		erdma_cep_set_inuse(cep);
> +
> +		if (cep->cm_id) {
> +			cep->cm_id->rem_ref(cep->cm_id);
> +			cep->cm_id = NULL;
> +		}
> +		cep->sock = NULL;
> +		erdma_socket_disassoc(s);
> +		cep->state = ERDMA_EPSTATE_CLOSED;
> +
> +		erdma_cep_set_free(cep);
> +		erdma_cep_put(cep);
> +	}
> +	sock_release(s);
> +
> +	return ret;
> +}
> +
> +static void erdma_drop_listeners(struct iw_cm_id *id)
> +{
> +	struct list_head *p, *tmp;
> +	/*
> +	 * In case of a wildcard rdma_listen on a multi-homed device,
> +	 * a listener's IWCM id is associated with more than one listening CEP.
> +	 */
> +	list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
> +		struct erdma_cep *cep =
> +			list_entry(p, struct erdma_cep, listenq);
> +
> +		list_del(p);
> +
> +		erdma_cep_set_inuse(cep);
> +
> +		if (cep->cm_id) {
> +			cep->cm_id->rem_ref(cep->cm_id);
> +			cep->cm_id = NULL;
> +		}
> +		if (cep->sock) {
> +			erdma_socket_disassoc(cep->sock);
> +			sock_release(cep->sock);
> +			cep->sock = NULL;
> +		}
> +		cep->state = ERDMA_EPSTATE_CLOSED;
> +		erdma_cep_set_free(cep);
> +		erdma_cep_put(cep);
> +	}
> +}
> +
> +int erdma_destroy_listen(struct iw_cm_id *id)
> +{
> +	if (!id->provider_data)
> +		return 0;
> +
> +	erdma_drop_listeners(id);
> +	kfree(id->provider_data);
> +	id->provider_data = NULL;
> +
> +	return 0;
> +}
> +
> +int erdma_cm_init(void)
> +{
> +	erdma_cm_wq = create_singlethread_workqueue("erdma_cm_wq");
> +	if (!erdma_cm_wq)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +void erdma_cm_exit(void)
> +{
> +	if (erdma_cm_wq)
> +		destroy_workqueue(erdma_cm_wq);
> +}
> diff --git a/drivers/infiniband/hw/erdma/erdma_cm.h b/drivers/infiniband/hw/erdma/erdma_cm.h
> new file mode 100644
> index 000000000000..040c05dd3625
> --- /dev/null
> +++ b/drivers/infiniband/hw/erdma/erdma_cm.h
> @@ -0,0 +1,167 @@
> +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
> +
> +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
> +/*          Kai Shen <kaishen@linux.alibaba.com> */
> +/* Copyright (c) 2020-2022, Alibaba Group. */
> +
> +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
> +/*          Greg Joyce <greg@opengridcomputing.com> */
> +/* Copyright (c) 2008-2019, IBM Corporation */
> +/* Copyright (c) 2017, Open Grid Computing, Inc. */
> +
> +#ifndef __ERDMA_CM_H__
> +#define __ERDMA_CM_H__
> +
> +#include <linux/tcp.h>
> +#include <net/sock.h>
> +#include <rdma/iw_cm.h>
> +
> +/* iWarp MPA protocol defs */
> +#define MPA_REVISION_EXT_1 129
> +#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA
> +#define MPA_KEY_REQ "MPA ID Req Frame"
> +#define MPA_KEY_REP "MPA ID Rep Frame"
> +#define MPA_KEY_SIZE 16
> +#define MPA_DEFAULT_HDR_LEN 28
> +
> +struct mpa_rr_params {
> +	__be16 bits;
> +	__be16 pd_len;
> +};
> +
> +/*
> + * MPA request/response Hdr bits & fields
> + */
> +enum {
> +	MPA_RR_FLAG_MARKERS = __cpu_to_be16(0x8000),
> +	MPA_RR_FLAG_CRC = __cpu_to_be16(0x4000),
> +	MPA_RR_FLAG_REJECT = __cpu_to_be16(0x2000),
> +	MPA_RR_RESERVED = __cpu_to_be16(0x1f00),
> +	MPA_RR_MASK_REVISION = __cpu_to_be16(0x00ff)
> +};
> +
> +/*
> + * MPA request/reply header
> + */
> +struct mpa_rr {
> +	u8 key[16];
> +	struct mpa_rr_params params;
> +};
> +
> +struct erdma_mpa_ext {
> +	__be32 cookie;
> +	__be32 bits;
> +};
> +
> +enum {
> +	MPA_EXT_FLAG_CC = cpu_to_be32(0x0000000f),
> +};
> +
> +struct erdma_mpa_info {
> +	struct mpa_rr hdr; /* peer mpa hdr in host byte order */
> +	struct erdma_mpa_ext ext_data;
> +	char *pdata;
> +	int bytes_rcvd;
> +};
> +
> +struct erdma_sk_upcalls {
> +	void (*sk_state_change)(struct sock *sk);
> +	void (*sk_data_ready)(struct sock *sk, int bytes);
> +	void (*sk_error_report)(struct sock *sk);
> +};
> +
> +struct erdma_dev;
> +
> +enum erdma_cep_state {
> +	ERDMA_EPSTATE_IDLE = 1,
> +	ERDMA_EPSTATE_LISTENING,
> +	ERDMA_EPSTATE_CONNECTING,
> +	ERDMA_EPSTATE_AWAIT_MPAREQ,
> +	ERDMA_EPSTATE_RECVD_MPAREQ,
> +	ERDMA_EPSTATE_AWAIT_MPAREP,
> +	ERDMA_EPSTATE_RDMA_MODE,
> +	ERDMA_EPSTATE_CLOSED
> +};
> +
> +struct erdma_cep {
> +	struct iw_cm_id *cm_id;
> +	struct erdma_dev *dev;
> +	struct list_head devq;
> +	spinlock_t lock;
> +	struct kref ref;
> +	int in_use;
> +	wait_queue_head_t waitq;
> +	enum erdma_cep_state state;
> +
> +	struct list_head listenq;
> +	struct erdma_cep *listen_cep;
> +
> +	struct erdma_qp *qp;
> +	struct socket *sock;
> +
> +	struct erdma_cm_work *mpa_timer;
> +	struct list_head work_freelist;
> +
> +	struct erdma_mpa_info mpa;
> +	int ord;
> +	int ird;
> +
> +	int pd_len;
> +	/* hold user's private data. */
> +	void *private_data;
> +
> +	/* Saved upcalls of socket llp.sock */
> +	void (*sk_state_change)(struct sock *sk);
> +	void (*sk_data_ready)(struct sock *sk);
> +	void (*sk_error_report)(struct sock *sk);
> +};
> +
> +#define MPAREQ_TIMEOUT (HZ * 20)
> +#define MPAREP_TIMEOUT (HZ * 10)
> +#define CONNECT_TIMEOUT (HZ * 10)
> +
> +enum erdma_work_type {
> +	ERDMA_CM_WORK_ACCEPT = 1,
> +	ERDMA_CM_WORK_READ_MPAHDR,
> +	ERDMA_CM_WORK_CLOSE_LLP, /* close socket */
> +	ERDMA_CM_WORK_PEER_CLOSE, /* socket indicated peer close */
> +	ERDMA_CM_WORK_MPATIMEOUT,
> +	ERDMA_CM_WORK_CONNECTED,
> +	ERDMA_CM_WORK_CONNECTTIMEOUT
> +};
> +
> +struct erdma_cm_work {
> +	struct delayed_work work;
> +	struct list_head list;
> +	enum erdma_work_type type;
> +	struct erdma_cep *cep;
> +};
> +
> +#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a)))
> +
> +static inline int getname_peer(struct socket *s, struct sockaddr_storage *a)
> +{
> +	return s->ops->getname(s, (struct sockaddr *)a, 1);
> +}
> +
> +static inline int getname_local(struct socket *s, struct sockaddr_storage *a)
> +{
> +	return s->ops->getname(s, (struct sockaddr *)a, 0);
> +}
> +
> +int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *param);
> +int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param);
> +int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen);
> +int erdma_create_listen(struct iw_cm_id *id, int backlog);
> +int erdma_destroy_listen(struct iw_cm_id *id);
> +
> +void erdma_cep_get(struct erdma_cep *ceq);
> +void erdma_cep_put(struct erdma_cep *ceq);
> +int erdma_cm_queue_work(struct erdma_cep *ceq, enum erdma_work_type type);
> +
> +int erdma_cm_init(void);
> +void erdma_cm_exit(void);
> +
> +#define sk_to_cep(sk) ((struct erdma_cep *)((sk)->sk_user_data))
> +
> +#endif
Cheng Xu June 16, 2022, 3:35 a.m. UTC | #2
On 6/16/22 10:59 AM, Weihang Li wrote:
> 

<...>
> 
> Hi, Cheng Xu
> 
> After modifying the QP to RTS, does that mean this TCP stream is offloaded
> to the hardware, and the inbound TCP segments related to the TCP stream
> will not be received by the kernel stack where this driver is running?
> If yes, the ACK of the MPA reply will never be received by the stack?

Actually, this depends on our infrastructure. ERDMA collaborates with the
OVS module, which decides the direction of packets. It makes sure that
packets from the VM will be sent to the VM, and packets from RDMA will be
sent to the RDMA engine. So, the ACK from the VM will always be sent to
the VM.

Thanks,
Cheng Xu
Weihang Li June 16, 2022, 6:04 a.m. UTC | #3
On 2022/6/16 11:35, Cheng Xu wrote:
> 
> 
> On 6/16/22 10:59 AM, Weihang Li wrote:
>>
> 
> <...>
>>
>> Hi, Cheng Xu
>>
>> After modifying the QP to RTS, does that mean this TCP stream is offloaded
>> to the hardware, and the inbound TCP segments related to the TCP stream
>> will not be received by the kernel stack where this driver is running?
>> If yes, the ACK of the MPA reply will never be received by the stack?
> 
> Actually, this depends on our infrastructure. ERDMA collaborates with the
> OVS module, which decides the direction of packets. It makes sure that
> packets from the VM will be sent to the VM, and packets from RDMA will be
> sent to the RDMA engine. So, the ACK from the VM will always be sent to
> the VM.
> 
> Thanks,
> Cheng Xu
> 
> 

OK, thanks for your explanation :)

Weihang
Stefan Metzmacher June 16, 2022, 9:11 a.m. UTC | #4
On 15.06.22 03:52, Cheng Xu wrote:
> ERDMA's transport protocol is iWarp, so the driver must support the CM
> interface. For the CM part, we take the same approach as SoftiWarp: a
> kernel socket is used to set up the connection, and MPA negotiation is
> then performed in the kernel. This part of the code therefore mainly
> comes from SoftiWarp; based on it, we add some more features, such as a
> non-blocking iw_connect implementation.

It would be great to have the common parts unified in the long run;
otherwise fixes are only applied to one version.

metze
Cheng Xu June 16, 2022, 9:51 a.m. UTC | #5
On 6/16/22 5:11 PM, Stefan Metzmacher wrote:
> 
> On 15.06.22 03:52, Cheng Xu wrote:
>> ERDMA's transport protocol is iWarp, so the driver must support the CM
>> interface. For the CM part, we take the same approach as SoftiWarp: a
>> kernel socket is used to set up the connection, and MPA negotiation is
>> then performed in the kernel. This part of the code therefore mainly
>> comes from SoftiWarp; based on it, we add some more features, such as a
>> non-blocking iw_connect implementation.
> 
> It would be great to have the common parts unified in the long run;
> otherwise fixes are only applied to one version.
> 

Yeah, it looks good. But for now I think it is better to keep the
respective implementations, because erdma and siw have different
purposes: siw aims to provide a standard iWarp implementation in the
kernel, while erdma is an RDMA adapter in our cloud that needs to fit
our hardware and its special network environment. There are already some
differences, and I'm not sure whether there will be more in the future.
We may consider it later.

Thanks for your advice.
Cheng Xu

Patch

diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c
new file mode 100644
index 000000000000..8b74a173dab7
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_cm.c
@@ -0,0 +1,1430 @@ 
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/*          Fredy Neeser */
+/*          Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#include <linux/errno.h>
+#include <linux/inetdevice.h>
+#include <linux/net.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <net/addrconf.h>
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+
+#include "erdma.h"
+#include "erdma_cm.h"
+#include "erdma_verbs.h"
+
+static struct workqueue_struct *erdma_cm_wq;
+
+static void erdma_cm_llp_state_change(struct sock *sk);
+static void erdma_cm_llp_data_ready(struct sock *sk);
+static void erdma_cm_llp_error_report(struct sock *sk);
+
+static void erdma_sk_assign_cm_upcalls(struct sock *sk)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_state_change = erdma_cm_llp_state_change;
+	sk->sk_data_ready = erdma_cm_llp_data_ready;
+	sk->sk_error_report = erdma_cm_llp_error_report;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void erdma_sk_save_upcalls(struct sock *sk)
+{
+	struct erdma_cep *cep = sk_to_cep(sk);
+
+	write_lock_bh(&sk->sk_callback_lock);
+	cep->sk_state_change = sk->sk_state_change;
+	cep->sk_data_ready = sk->sk_data_ready;
+	cep->sk_error_report = sk->sk_error_report;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void erdma_sk_restore_upcalls(struct sock *sk, struct erdma_cep *cep)
+{
+	sk->sk_state_change = cep->sk_state_change;
+	sk->sk_data_ready = cep->sk_data_ready;
+	sk->sk_error_report = cep->sk_error_report;
+	sk->sk_user_data = NULL;
+}
+
+static void erdma_socket_disassoc(struct socket *s)
+{
+	struct sock *sk = s->sk;
+	struct erdma_cep *cep;
+
+	if (sk) {
+		write_lock_bh(&sk->sk_callback_lock);
+		cep = sk_to_cep(sk);
+		if (cep) {
+			erdma_sk_restore_upcalls(sk, cep);
+			erdma_cep_put(cep);
+		} else {
+			WARN_ON_ONCE(1);
+		}
+		write_unlock_bh(&sk->sk_callback_lock);
+	} else {
+		WARN_ON_ONCE(1);
+	}
+}
+
+static void erdma_cep_socket_assoc(struct erdma_cep *cep, struct socket *s)
+{
+	cep->sock = s;
+	erdma_cep_get(cep);
+	s->sk->sk_user_data = cep;
+
+	erdma_sk_save_upcalls(s->sk);
+	erdma_sk_assign_cm_upcalls(s->sk);
+}
+
+static void erdma_disassoc_listen_cep(struct erdma_cep *cep)
+{
+	if (cep->listen_cep) {
+		erdma_cep_put(cep->listen_cep);
+		cep->listen_cep = NULL;
+	}
+}
+
+static struct erdma_cep *erdma_cep_alloc(struct erdma_dev *dev)
+{
+	struct erdma_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
+	unsigned long flags;
+
+	if (!cep)
+		return NULL;
+
+	INIT_LIST_HEAD(&cep->listenq);
+	INIT_LIST_HEAD(&cep->devq);
+	INIT_LIST_HEAD(&cep->work_freelist);
+
+	kref_init(&cep->ref);
+	cep->state = ERDMA_EPSTATE_IDLE;
+	init_waitqueue_head(&cep->waitq);
+	spin_lock_init(&cep->lock);
+	cep->dev = dev;
+
+	spin_lock_irqsave(&dev->lock, flags);
+	list_add_tail(&cep->devq, &dev->cep_list);
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	return cep;
+}
+
+static void erdma_cm_free_work(struct erdma_cep *cep)
+{
+	struct list_head *w, *tmp;
+	struct erdma_cm_work *work;
+
+	list_for_each_safe(w, tmp, &cep->work_freelist) {
+		work = list_entry(w, struct erdma_cm_work, list);
+		list_del(&work->list);
+		kfree(work);
+	}
+}
+
+static void erdma_cancel_mpatimer(struct erdma_cep *cep)
+{
+	spin_lock_bh(&cep->lock);
+	if (cep->mpa_timer) {
+		if (cancel_delayed_work(&cep->mpa_timer->work)) {
+			erdma_cep_put(cep);
+			kfree(cep->mpa_timer);
+		}
+		cep->mpa_timer = NULL;
+	}
+	spin_unlock_bh(&cep->lock);
+}
+
+static void erdma_put_work(struct erdma_cm_work *work)
+{
+	INIT_LIST_HEAD(&work->list);
+	spin_lock_bh(&work->cep->lock);
+	list_add(&work->list, &work->cep->work_freelist);
+	spin_unlock_bh(&work->cep->lock);
+}
+
+static void erdma_cep_set_inuse(struct erdma_cep *cep)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cep->lock, flags);
+	while (cep->in_use) {
+		spin_unlock_irqrestore(&cep->lock, flags);
+		wait_event_interruptible(cep->waitq, !cep->in_use);
+		if (signal_pending(current))
+			flush_signals(current);
+
+		spin_lock_irqsave(&cep->lock, flags);
+	}
+
+	cep->in_use = 1;
+	spin_unlock_irqrestore(&cep->lock, flags);
+}
+
+static void erdma_cep_set_free(struct erdma_cep *cep)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cep->lock, flags);
+	cep->in_use = 0;
+	spin_unlock_irqrestore(&cep->lock, flags);
+
+	wake_up(&cep->waitq);
+}
+
+static void __erdma_cep_dealloc(struct kref *ref)
+{
+	struct erdma_cep *cep = container_of(ref, struct erdma_cep, ref);
+	struct erdma_dev *dev = cep->dev;
+	unsigned long flags;
+
+	WARN_ON(cep->listen_cep);
+
+	kfree(cep->private_data);
+	kfree(cep->mpa.pdata);
+	spin_lock_bh(&cep->lock);
+	if (!list_empty(&cep->work_freelist))
+		erdma_cm_free_work(cep);
+	spin_unlock_bh(&cep->lock);
+
+	spin_lock_irqsave(&dev->lock, flags);
+	list_del(&cep->devq);
+	spin_unlock_irqrestore(&dev->lock, flags);
+	kfree(cep);
+}
+
+static struct erdma_cm_work *erdma_get_work(struct erdma_cep *cep)
+{
+	struct erdma_cm_work *work = NULL;
+
+	spin_lock_bh(&cep->lock);
+	if (!list_empty(&cep->work_freelist)) {
+		work = list_entry(cep->work_freelist.next, struct erdma_cm_work,
+				  list);
+		list_del_init(&work->list);
+	}
+
+	spin_unlock_bh(&cep->lock);
+	return work;
+}
+
+static int erdma_cm_alloc_work(struct erdma_cep *cep, int num)
+{
+	struct erdma_cm_work *work;
+
+	while (num--) {
+		work = kmalloc(sizeof(*work), GFP_KERNEL);
+		if (!work) {
+			if (!(list_empty(&cep->work_freelist)))
+				erdma_cm_free_work(cep);
+			return -ENOMEM;
+		}
+		work->cep = cep;
+		INIT_LIST_HEAD(&work->list);
+		list_add(&work->list, &cep->work_freelist);
+	}
+
+	return 0;
+}
+
+static int erdma_cm_upcall(struct erdma_cep *cep, enum iw_cm_event_type reason,
+			   int status)
+{
+	struct iw_cm_event event;
+	struct iw_cm_id *cm_id;
+
+	memset(&event, 0, sizeof(event));
+	event.status = status;
+	event.event = reason;
+
+	if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
+		event.provider_data = cep;
+		cm_id = cep->listen_cep->cm_id;
+
+		event.ird = cep->dev->attrs.max_ird;
+		event.ord = cep->dev->attrs.max_ord;
+	} else {
+		cm_id = cep->cm_id;
+	}
+
+	if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
+	    reason == IW_CM_EVENT_CONNECT_REPLY) {
+		u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
+
+		if (pd_len && cep->mpa.pdata) {
+			event.private_data_len = pd_len;
+			event.private_data = cep->mpa.pdata;
+		}
+
+		getname_local(cep->sock, &event.local_addr);
+		getname_peer(cep->sock, &event.remote_addr);
+	}
+
+	return cm_id->event_handler(cm_id, &event);
+}
+
+void erdma_qp_cm_drop(struct erdma_qp *qp)
+{
+	struct erdma_cep *cep = qp->cep;
+
+	if (!qp->cep)
+		return;
+
+	erdma_cep_set_inuse(cep);
+
+	/* already closed. */
+	if (cep->state == ERDMA_EPSTATE_CLOSED)
+		goto out;
+
+	if (cep->cm_id) {
+		switch (cep->state) {
+		case ERDMA_EPSTATE_AWAIT_MPAREP:
+			erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+					-EINVAL);
+			break;
+
+		case ERDMA_EPSTATE_RDMA_MODE:
+			erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+			break;
+
+		case ERDMA_EPSTATE_IDLE:
+		case ERDMA_EPSTATE_LISTENING:
+		case ERDMA_EPSTATE_CONNECTING:
+		case ERDMA_EPSTATE_AWAIT_MPAREQ:
+		case ERDMA_EPSTATE_RECVD_MPAREQ:
+		case ERDMA_EPSTATE_CLOSED:
+		default:
+			break;
+		}
+		cep->cm_id->rem_ref(cep->cm_id);
+		cep->cm_id = NULL;
+		erdma_cep_put(cep);
+	}
+	cep->state = ERDMA_EPSTATE_CLOSED;
+
+	if (cep->sock) {
+		erdma_socket_disassoc(cep->sock);
+		sock_release(cep->sock);
+		cep->sock = NULL;
+	}
+
+	if (cep->qp) {
+		cep->qp = NULL;
+		erdma_qp_put(qp);
+	}
+out:
+	erdma_cep_set_free(cep);
+}
+
+void erdma_cep_put(struct erdma_cep *cep)
+{
+	WARN_ON(kref_read(&cep->ref) < 1);
+	kref_put(&cep->ref, __erdma_cep_dealloc);
+}
+
+void erdma_cep_get(struct erdma_cep *cep)
+{
+	kref_get(&cep->ref);
+}
+
+static int erdma_send_mpareqrep(struct erdma_cep *cep, const void *pdata,
+				u8 pd_len)
+{
+	struct socket *s = cep->sock;
+	struct mpa_rr *rr = &cep->mpa.hdr;
+	struct kvec iov[3];
+	struct msghdr msg;
+	int iovec_num = 0;
+	int ret;
+	int mpa_len;
+
+	memset(&msg, 0, sizeof(msg));
+
+	rr->params.pd_len = cpu_to_be16(pd_len);
+
+	iov[iovec_num].iov_base = rr;
+	iov[iovec_num].iov_len = sizeof(*rr);
+	iovec_num++;
+	mpa_len = sizeof(*rr);
+
+	iov[iovec_num].iov_base = &cep->mpa.ext_data;
+	iov[iovec_num].iov_len = sizeof(cep->mpa.ext_data);
+	iovec_num++;
+	mpa_len += sizeof(cep->mpa.ext_data);
+
+	if (pd_len) {
+		iov[iovec_num].iov_base = (char *)pdata;
+		iov[iovec_num].iov_len = pd_len;
+		mpa_len += pd_len;
+		iovec_num++;
+	}
+
+	ret = kernel_sendmsg(s, &msg, iov, iovec_num, mpa_len);
+
+	return ret < 0 ? ret : 0;
+}
+
+static inline int ksock_recv(struct socket *sock, char *buf, size_t size,
+			     int flags)
+{
+	struct kvec iov = { buf, size };
+	struct msghdr msg = { .msg_name = NULL, .msg_flags = flags };
+
+	return kernel_recvmsg(sock, &msg, &iov, 1, size, flags);
+}
+
+static int __recv_mpa_hdr(struct erdma_cep *cep, int hdr_rcvd, char *hdr,
+			  int hdr_size, int *rcvd_out)
+{
+	struct socket *s = cep->sock;
+	int rcvd;
+
+	*rcvd_out = 0;
+	if (hdr_rcvd < hdr_size) {
+		rcvd = ksock_recv(s, hdr + hdr_rcvd, hdr_size - hdr_rcvd,
+				  MSG_DONTWAIT);
+		if (rcvd == -EAGAIN)
+			return -EAGAIN;
+
+		if (rcvd <= 0)
+			return -ECONNABORTED;
+
+		hdr_rcvd += rcvd;
+		*rcvd_out = rcvd;
+
+		if (hdr_rcvd < hdr_size)
+			return -EAGAIN;
+	}
+
+	return 0;
+}
+
+static void __mpa_rr_set_revision(__be16 *bits, u8 rev)
+{
+	*bits = (*bits & ~MPA_RR_MASK_REVISION) |
+		(cpu_to_be16(rev) & MPA_RR_MASK_REVISION);
+}
+
+static u8 __mpa_rr_revision(__be16 mpa_rr_bits)
+{
+	__be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION;
+
+	return (u8)be16_to_cpu(rev);
+}
+
+static void __mpa_ext_set_cc(__be32 *bits, u32 cc)
+{
+	*bits = (*bits & ~MPA_EXT_FLAG_CC) |
+		(cpu_to_be32(cc) & MPA_EXT_FLAG_CC);
+}
+
+static u8 __mpa_ext_cc(__be32 mpa_ext_bits)
+{
+	__be32 cc = mpa_ext_bits & MPA_EXT_FLAG_CC;
+
+	return (u8)be32_to_cpu(cc);
+}
+
+/*
+ * Receive MPA Request/Reply header.
+ *
+ * Returns 0 if the complete MPA Request/Reply header, including
+ * any private data, was received. Returns -EAGAIN if the header
+ * was only partially received, or a negative error code otherwise.
+ *
+ * Context: May be called in process context only.
+ */
+static int erdma_recv_mpa_rr(struct erdma_cep *cep)
+{
+	struct mpa_rr *hdr = &cep->mpa.hdr;
+	struct socket *s = cep->sock;
+	u16 pd_len;
+	int rcvd, to_rcv, ret, pd_rcvd;
+
+	if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
+		ret = __recv_mpa_hdr(cep, cep->mpa.bytes_rcvd,
+				     (char *)&cep->mpa.hdr,
+				     sizeof(struct mpa_rr), &rcvd);
+		cep->mpa.bytes_rcvd += rcvd;
+		if (ret)
+			return ret;
+	}
+
+	if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA ||
+	    __mpa_rr_revision(hdr->params.bits) != MPA_REVISION_EXT_1)
+		return -EPROTO;
+
+	if (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) <
+	    sizeof(struct erdma_mpa_ext)) {
+		ret = __recv_mpa_hdr(
+			cep, cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
+			(char *)&cep->mpa.ext_data,
+			sizeof(struct erdma_mpa_ext), &rcvd);
+		cep->mpa.bytes_rcvd += rcvd;
+		if (ret)
+			return ret;
+	}
+
+	pd_len = be16_to_cpu(hdr->params.pd_len);
+	pd_rcvd = cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) -
+		  sizeof(struct erdma_mpa_ext);
+	to_rcv = pd_len - pd_rcvd;
+
+	if (!to_rcv) {
+		/*
+		 * We have received the whole MPA Request/Reply message.
+		 * Check that the peer did not send any trailing data,
+		 * which would be a protocol violation.
+		 */
+		u32 word;
+
+		ret = __recv_mpa_hdr(cep, 0, (char *)&word, sizeof(word),
+				     &rcvd);
+		if (ret == -EAGAIN && rcvd == 0)
+			return 0;
+
+		if (ret)
+			return ret;
+
+		return -EPROTO;
+	}
+
+	/*
+	 * At this point, MPA header has been fully received, and pd_len != 0.
+	 * So, begin to receive private data.
+	 */
+	if (!cep->mpa.pdata) {
+		cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
+		if (!cep->mpa.pdata)
+			return -ENOMEM;
+	}
+
+	rcvd = ksock_recv(s, cep->mpa.pdata + pd_rcvd, to_rcv + 4,
+			  MSG_DONTWAIT);
+	if (rcvd < 0)
+		return rcvd;
+
+	if (rcvd > to_rcv)
+		return -EPROTO;
+
+	cep->mpa.bytes_rcvd += rcvd;
+
+	if (to_rcv == rcvd)
+		return 0;
+
+	return -EAGAIN;
+}
+
+/*
+ * erdma_proc_mpareq()
+ *
+ * Read the MPA Request from the socket and, on success, signal the new
+ * connection to the IWCM. The caller must hold the lock on the
+ * corresponding listening CEP.
+ */
+static int erdma_proc_mpareq(struct erdma_cep *cep)
+{
+	struct mpa_rr *req;
+	int ret;
+
+	ret = erdma_recv_mpa_rr(cep);
+	if (ret)
+		return ret;
+
+	req = &cep->mpa.hdr;
+
+	if (memcmp(req->key, MPA_KEY_REQ, MPA_KEY_SIZE))
+		return -EPROTO;
+
+	memcpy(req->key, MPA_KEY_REP, MPA_KEY_SIZE);
+
+	/* Currently does not support marker and crc. */
+	if (req->params.bits & MPA_RR_FLAG_MARKERS ||
+	    req->params.bits & MPA_RR_FLAG_CRC)
+		goto reject_conn;
+
+	cep->state = ERDMA_EPSTATE_RECVD_MPAREQ;
+
+	/* Keep reference until IWCM accepts/rejects */
+	erdma_cep_get(cep);
+	ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
+	if (ret)
+		erdma_cep_put(cep);
+
+	return ret;
+
+reject_conn:
+	req->params.bits &= ~MPA_RR_FLAG_MARKERS;
+	req->params.bits |= MPA_RR_FLAG_REJECT;
+	req->params.bits &= ~MPA_RR_FLAG_CRC;
+
+	kfree(cep->mpa.pdata);
+	cep->mpa.pdata = NULL;
+	erdma_send_mpareqrep(cep, NULL, 0);
+
+	return -EOPNOTSUPP;
+}
+
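+/*
+ * Process the MPA Reply on the active side: validate the key and the
+ * feature flags, move the QP to RTS and report the result to the IWCM
+ * via IW_CM_EVENT_CONNECT_REPLY.
+ */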
+static int erdma_proc_mpareply(struct erdma_cep *cep)
+{
+	struct erdma_qp_attrs qp_attrs;
+	struct erdma_qp *qp = cep->qp;
+	struct mpa_rr *rep;
+	int ret;
+
+	ret = erdma_recv_mpa_rr(cep);
+	if (ret != -EAGAIN)
+		erdma_cancel_mpatimer(cep);
+	if (ret)
+		goto out_err;
+
+	rep = &cep->mpa.hdr;
+
+	if (memcmp(rep->key, MPA_KEY_REP, MPA_KEY_SIZE)) {
+		ret = -EPROTO;
+		goto out_err;
+	}
+
+	if (rep->params.bits & MPA_RR_FLAG_REJECT) {
+		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
+		return -ECONNRESET;
+	}
+
+	/* Currently does not support marker and crc. */
+	if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
+	    (rep->params.bits & MPA_RR_FLAG_CRC)) {
+		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
+		return -EINVAL;
+	}
+
+	memset(&qp_attrs, 0, sizeof(qp_attrs));
+	qp_attrs.irq_size = cep->ird;
+	qp_attrs.orq_size = cep->ord;
+	qp_attrs.state = ERDMA_QP_STATE_RTS;
+
+	down_write(&qp->state_lock);
+	if (qp->attrs.state > ERDMA_QP_STATE_RTR) {
+		ret = -EINVAL;
+		up_write(&qp->state_lock);
+		goto out_err;
+	}
+
+	qp->attrs.qp_type = ERDMA_QP_ACTIVE;
+	if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc)
+		qp->attrs.cc = COMPROMISE_CC;
+
+	ret = erdma_modify_qp_internal(qp, &qp_attrs,
+				       ERDMA_QP_ATTR_STATE |
+				       ERDMA_QP_ATTR_LLP_HANDLE |
+				       ERDMA_QP_ATTR_MPA);
+
+	up_write(&qp->state_lock);
+
+	if (!ret) {
+		ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
+		if (!ret)
+			cep->state = ERDMA_EPSTATE_RDMA_MODE;
+
+		return 0;
+	}
+
+out_err:
+	erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
+	return ret;
+}
+
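+/*
+ * Accept a pending TCP connection on a listening CEP: allocate a child
+ * CEP, arm the MPA request timer and, if an MPA request is already
+ * queued on the new socket, process it immediately.
+ */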
+static void erdma_accept_newconn(struct erdma_cep *cep)
+{
+	struct socket *s = cep->sock;
+	struct socket *new_s = NULL;
+	struct erdma_cep *new_cep = NULL;
+	int ret = 0;
+
+	if (cep->state != ERDMA_EPSTATE_LISTENING)
+		goto error;
+
+	new_cep = erdma_cep_alloc(cep->dev);
+	if (!new_cep)
+		goto error;
+
+	/*
+	 * Allocate 4 work elements: enough for concurrent handling of
+	 * local + peer close events, MPA header processing and the MPA
+	 * timeout.
+	 */
+	if (erdma_cm_alloc_work(new_cep, 4) != 0)
+		goto error;
+
+	/*
+	 * Copy the saved socket callbacks from the listening CEP and
+	 * associate the new socket with the new CEP.
+	 */
+	new_cep->sk_state_change = cep->sk_state_change;
+	new_cep->sk_data_ready = cep->sk_data_ready;
+	new_cep->sk_error_report = cep->sk_error_report;
+
+	ret = kernel_accept(s, &new_s, O_NONBLOCK);
+	if (ret != 0)
+		goto error;
+
+	new_cep->sock = new_s;
+	erdma_cep_get(new_cep);
+	new_s->sk->sk_user_data = new_cep;
+
+	tcp_sock_set_nodelay(new_s->sk);
+	new_cep->state = ERDMA_EPSTATE_AWAIT_MPAREQ;
+
+	ret = erdma_cm_queue_work(new_cep, ERDMA_CM_WORK_MPATIMEOUT);
+	if (ret)
+		goto error;
+
+	new_cep->listen_cep = cep;
+	erdma_cep_get(cep);
+
+	if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
+		/* MPA REQ already queued */
+		erdma_cep_set_inuse(new_cep);
+		ret = erdma_proc_mpareq(new_cep);
+		if (ret != -EAGAIN) {
+			erdma_cep_put(cep);
+			new_cep->listen_cep = NULL;
+			if (ret) {
+				erdma_cep_set_free(new_cep);
+				goto error;
+			}
+		}
+		erdma_cep_set_free(new_cep);
+	}
+	return;
+
+error:
+	if (new_cep) {
+		new_cep->state = ERDMA_EPSTATE_CLOSED;
+		erdma_cancel_mpatimer(new_cep);
+
+		/* Clear the socket pointer before the put may free the CEP. */
+		new_cep->sock = NULL;
+		erdma_cep_put(new_cep);
+	}
+
+	if (new_s) {
+		erdma_socket_disassoc(new_s);
+		sock_release(new_s);
+	}
+}
+
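+/*
+ * Active side: the TCP connection is established, so build and send
+ * the MPA request and arm the MPA reply timer.
+ */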
+static int erdma_newconn_connected(struct erdma_cep *cep)
+{
+	int ret = 0;
+
+	cep->mpa.hdr.params.bits = 0;
+	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1);
+
+	memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE);
+	cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie);
+	__mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc);
+
+	ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len);
+	cep->state = ERDMA_EPSTATE_AWAIT_MPAREP;
+	cep->mpa.hdr.params.pd_len = 0;
+
+	if (ret >= 0)
+		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_MPATIMEOUT);
+
+	return ret;
+}
+
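+/*
+ * Deferred CM work handler: CEP state transitions triggered by socket
+ * upcalls or timeouts are processed here with the CEP marked in-use,
+ * and the CEP is torn down when release_cep is set.
+ */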
+static void erdma_cm_work_handler(struct work_struct *w)
+{
+	struct erdma_cm_work *work;
+	struct erdma_cep *cep;
+	int release_cep = 0, ret = 0;
+
+	work = container_of(w, struct erdma_cm_work, work.work);
+	cep = work->cep;
+
+	erdma_cep_set_inuse(cep);
+
+	switch (work->type) {
+	case ERDMA_CM_WORK_CONNECTED:
+		erdma_cancel_mpatimer(cep);
+		if (cep->state == ERDMA_EPSTATE_CONNECTING) {
+			ret = erdma_newconn_connected(cep);
+			if (ret) {
+				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+						-EIO);
+				release_cep = 1;
+			}
+		}
+		break;
+	case ERDMA_CM_WORK_CONNECTTIMEOUT:
+		if (cep->state == ERDMA_EPSTATE_CONNECTING) {
+			cep->mpa_timer = NULL;
+			erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+					-ETIMEDOUT);
+			release_cep = 1;
+		}
+		break;
+	case ERDMA_CM_WORK_ACCEPT:
+		erdma_accept_newconn(cep);
+		break;
+	case ERDMA_CM_WORK_READ_MPAHDR:
+		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
+			if (cep->listen_cep) {
+				erdma_cep_set_inuse(cep->listen_cep);
+
+				if (cep->listen_cep->state ==
+				    ERDMA_EPSTATE_LISTENING)
+					ret = erdma_proc_mpareq(cep);
+				else
+					ret = -EFAULT;
+
+				erdma_cep_set_free(cep->listen_cep);
+
+				if (ret != -EAGAIN) {
+					erdma_cep_put(cep->listen_cep);
+					cep->listen_cep = NULL;
+					if (ret)
+						erdma_cep_put(cep);
+				}
+			}
+		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
+			ret = erdma_proc_mpareply(cep);
+		}
+
+		if (ret && ret != -EAGAIN)
+			release_cep = 1;
+		break;
+	case ERDMA_CM_WORK_CLOSE_LLP:
+		if (cep->cm_id)
+			erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+		release_cep = 1;
+		break;
+	case ERDMA_CM_WORK_PEER_CLOSE:
+		if (cep->cm_id) {
+			if (cep->state == ERDMA_EPSTATE_CONNECTING ||
+			    cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
+				/*
+				 * MPA reply not received, but the connection
+				 * was dropped.
+				 */
+				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+						-ECONNRESET);
+			} else if (cep->state == ERDMA_EPSTATE_RDMA_MODE) {
+				/*
+				 * NOTE: IW_CM_EVENT_DISCONNECT is given just
+				 *       to transition IWCM into CLOSING.
+				 */
+				erdma_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
+				erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+			}
+		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
+			/* Socket close before MPA request received. */
+			erdma_disassoc_listen_cep(cep);
+			erdma_cep_put(cep);
+		}
+		release_cep = 1;
+		break;
+	case ERDMA_CM_WORK_MPATIMEOUT:
+		cep->mpa_timer = NULL;
+		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
+			/*
+			 * MPA request timed out:
+			 * Hide any partially received private data and signal
+			 * timeout
+			 */
+			cep->mpa.hdr.params.pd_len = 0;
+
+			if (cep->cm_id)
+				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+						-ETIMEDOUT);
+			release_cep = 1;
+		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
+			/* No MPA req received after peer TCP stream setup. */
+			erdma_disassoc_listen_cep(cep);
+
+			erdma_cep_put(cep);
+			release_cep = 1;
+		}
+		break;
+	default:
+		WARN(1, "Undefined CM work type: %d\n", work->type);
+	}
+
+	if (release_cep) {
+		erdma_cancel_mpatimer(cep);
+		cep->state = ERDMA_EPSTATE_CLOSED;
+		if (cep->qp) {
+			struct erdma_qp *qp = cep->qp;
+			/*
+			 * Serialize a potential race with application
+			 * closing the QP and calling erdma_qp_cm_drop()
+			 */
+			erdma_qp_get(qp);
+			erdma_cep_set_free(cep);
+
+			erdma_qp_llp_close(qp);
+			erdma_qp_put(qp);
+
+			erdma_cep_set_inuse(cep);
+			cep->qp = NULL;
+			erdma_qp_put(qp);
+		}
+
+		if (cep->sock) {
+			erdma_socket_disassoc(cep->sock);
+			sock_release(cep->sock);
+			cep->sock = NULL;
+		}
+
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+			if (cep->state != ERDMA_EPSTATE_LISTENING)
+				erdma_cep_put(cep);
+		}
+	}
+	erdma_cep_set_free(cep);
+	erdma_put_work(work);
+	erdma_cep_put(cep);
+}
+
+int erdma_cm_queue_work(struct erdma_cep *cep, enum erdma_work_type type)
+{
+	struct erdma_cm_work *work = erdma_get_work(cep);
+	unsigned long delay = 0;
+
+	if (!work)
+		return -ENOMEM;
+
+	work->type = type;
+	work->cep = cep;
+
+	erdma_cep_get(cep);
+
+	INIT_DELAYED_WORK(&work->work, erdma_cm_work_handler);
+
+	if (type == ERDMA_CM_WORK_MPATIMEOUT) {
+		cep->mpa_timer = work;
+
+		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP)
+			delay = MPAREP_TIMEOUT;
+		else
+			delay = MPAREQ_TIMEOUT;
+	} else if (type == ERDMA_CM_WORK_CONNECTTIMEOUT) {
+		cep->mpa_timer = work;
+
+		delay = CONNECT_TIMEOUT;
+	}
+
+	queue_delayed_work(erdma_cm_wq, &work->work, delay);
+
+	return 0;
+}
+
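+/*
+ * Socket data-ready upcall: defer MPA request/reply processing to the
+ * CM workqueue while an MPA message is expected.
+ */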
+static void erdma_cm_llp_data_ready(struct sock *sk)
+{
+	struct erdma_cep *cep;
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (!cep)
+		goto out;
+
+	if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ ||
+	    cep->state == ERDMA_EPSTATE_AWAIT_MPAREP)
+		erdma_cm_queue_work(cep, ERDMA_CM_WORK_READ_MPAHDR);
+
+out:
+	read_unlock(&sk->sk_callback_lock);
+}
+
+static void erdma_cm_llp_error_report(struct sock *sk)
+{
+	struct erdma_cep *cep = sk_to_cep(sk);
+
+	if (cep)
+		cep->sk_error_report(sk);
+}
+
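+/*
+ * TCP state change upcall: TCP_ESTABLISHED schedules CONNECTED work
+ * (active side) or ACCEPT work (listening side); TCP_CLOSE and
+ * TCP_CLOSE_WAIT schedule PEER_CLOSE handling.
+ */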
+static void erdma_cm_llp_state_change(struct sock *sk)
+{
+	struct erdma_cep *cep;
+	void (*orig_state_change)(struct sock *sk);
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (!cep) {
+		read_unlock(&sk->sk_callback_lock);
+		return;
+	}
+	orig_state_change = cep->sk_state_change;
+
+	switch (sk->sk_state) {
+	case TCP_ESTABLISHED:
+		if (cep->state == ERDMA_EPSTATE_CONNECTING)
+			erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED);
+		else
+			erdma_cm_queue_work(cep, ERDMA_CM_WORK_ACCEPT);
+		break;
+	case TCP_CLOSE:
+	case TCP_CLOSE_WAIT:
+		if (cep->state != ERDMA_EPSTATE_LISTENING)
+			erdma_cm_queue_work(cep, ERDMA_CM_WORK_PEER_CLOSE);
+		break;
+	default:
+		break;
+	}
+	read_unlock(&sk->sk_callback_lock);
+	orig_state_change(sk);
+}
+
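+/* Bind the socket with SO_REUSEADDR set and initiate the TCP connect. */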
+static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
+			      int laddrlen, struct sockaddr *raddr,
+			      int raddrlen, int flags)
+{
+	int ret;
+
+	sock_set_reuseaddr(s->sk);
+	ret = s->ops->bind(s, laddr, laddrlen);
+	if (ret)
+		return ret;
+	ret = s->ops->connect(s, raddr, raddrlen, flags);
+	return ret < 0 ? ret : 0;
+}
+
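+/*
+ * iw_cm connect handler. A kernel TCP socket is created and connected
+ * in non-blocking mode; the MPA exchange is driven from the CM work
+ * handler once the socket reports TCP_ESTABLISHED, or a connect
+ * timeout fires.
+ */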
+int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+	struct erdma_dev *dev = to_edev(id->device);
+	struct erdma_qp *qp;
+	struct erdma_cep *cep = NULL;
+	struct socket *s = NULL;
+	struct sockaddr *laddr = (struct sockaddr *)&id->m_local_addr;
+	struct sockaddr *raddr = (struct sockaddr *)&id->m_remote_addr;
+	u16 pd_len = params->private_data_len;
+	int ret;
+
+	if (pd_len > MPA_MAX_PRIVDATA)
+		return -EINVAL;
+
+	if (params->ird > dev->attrs.max_ird ||
+	    params->ord > dev->attrs.max_ord)
+		return -EINVAL;
+
+	if (laddr->sa_family != AF_INET || raddr->sa_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	qp = find_qp_by_qpn(dev, params->qpn);
+	if (!qp)
+		return -ENOENT;
+	erdma_qp_get(qp);
+
+	ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &s);
+	if (ret < 0)
+		goto error_put_qp;
+
+	cep = erdma_cep_alloc(dev);
+	if (!cep) {
+		ret = -ENOMEM;
+		goto error_release_sock;
+	}
+
+	erdma_cep_set_inuse(cep);
+
+	/* Associate QP with CEP */
+	erdma_cep_get(cep);
+	qp->cep = cep;
+	cep->qp = qp;
+
+	/* Associate cm_id with CEP */
+	id->add_ref(id);
+	cep->cm_id = id;
+
+	/*
+	 * Allocate 6 work elements: enough for concurrent handling of
+	 * local + peer close events, MPA header processing, the MPA
+	 * timeout, the connected event and the connect timeout.
+	 */
+	ret = erdma_cm_alloc_work(cep, 6);
+	if (ret != 0) {
+		ret = -ENOMEM;
+		goto error_release_cep;
+	}
+
+	cep->ird = params->ird;
+	cep->ord = params->ord;
+	cep->state = ERDMA_EPSTATE_CONNECTING;
+
+	erdma_cep_socket_assoc(cep, s);
+
+	if (pd_len) {
+		cep->pd_len = pd_len;
+		cep->private_data = kmalloc(pd_len, GFP_KERNEL);
+		if (!cep->private_data) {
+			ret = -ENOMEM;
+			goto error_disassoc;
+		}
+
+		memcpy(cep->private_data, params->private_data,
+		       params->private_data_len);
+	}
+
+	ret = kernel_bindconnect(s, laddr, sizeof(*laddr), raddr,
+				 sizeof(*raddr), O_NONBLOCK);
+	if (ret != -EINPROGRESS && ret != 0) {
+		goto error_disassoc;
+	} else if (ret == 0) {
+		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED);
+		if (ret)
+			goto error_disassoc;
+	} else {
+		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTTIMEOUT);
+		if (ret)
+			goto error_disassoc;
+	}
+
+	erdma_cep_set_free(cep);
+	return 0;
+
+error_disassoc:
+	kfree(cep->private_data);
+	cep->private_data = NULL;
+	cep->pd_len = 0;
+
+	erdma_socket_disassoc(s);
+
+error_release_cep:
+	/* disassoc with cm_id */
+	cep->cm_id = NULL;
+	id->rem_ref(id);
+
+	/* disassoc with qp */
+	qp->cep = NULL;
+	erdma_cep_put(cep);
+	cep->qp = NULL;
+
+	cep->state = ERDMA_EPSTATE_CLOSED;
+
+	erdma_cep_set_free(cep);
+
+	/* release the cep. */
+	erdma_cep_put(cep);
+
+error_release_sock:
+	if (s)
+		sock_release(s);
+error_put_qp:
+	erdma_qp_put(qp);
+
+	return ret;
+}
+
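+/*
+ * iw_cm accept handler: validate the connection parameters, move the
+ * QP to RTS, send the MPA reply and report IW_CM_EVENT_ESTABLISHED.
+ */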
+int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+	struct erdma_dev *dev = to_edev(id->device);
+	struct erdma_cep *cep = (struct erdma_cep *)id->provider_data;
+	struct erdma_qp *qp;
+	struct erdma_qp_attrs qp_attrs;
+	int ret;
+
+	erdma_cep_set_inuse(cep);
+	erdma_cep_put(cep);
+
+	/* Free lingering inbound private data */
+	if (cep->mpa.hdr.params.pd_len) {
+		cep->mpa.hdr.params.pd_len = 0;
+		kfree(cep->mpa.pdata);
+		cep->mpa.pdata = NULL;
+	}
+	erdma_cancel_mpatimer(cep);
+
+	if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) {
+		erdma_cep_set_free(cep);
+		erdma_cep_put(cep);
+
+		return -ECONNRESET;
+	}
+
+	qp = find_qp_by_qpn(dev, params->qpn);
+	if (!qp)
+		return -ENOENT;
+	erdma_qp_get(qp);
+
+	down_write(&qp->state_lock);
+	if (qp->attrs.state > ERDMA_QP_STATE_RTR) {
+		ret = -EINVAL;
+		up_write(&qp->state_lock);
+		goto error;
+	}
+
+	if (params->ord > dev->attrs.max_ord ||
+	    params->ird > dev->attrs.max_ird) {
+		ret = -EINVAL;
+		up_write(&qp->state_lock);
+		goto error;
+	}
+
+	if (params->private_data_len > MPA_MAX_PRIVDATA) {
+		ret = -EINVAL;
+		up_write(&qp->state_lock);
+		goto error;
+	}
+
+	cep->ird = params->ird;
+	cep->ord = params->ord;
+
+	cep->cm_id = id;
+	id->add_ref(id);
+
+	memset(&qp_attrs, 0, sizeof(qp_attrs));
+	qp_attrs.orq_size = params->ord;
+	qp_attrs.irq_size = params->ird;
+
+	qp_attrs.state = ERDMA_QP_STATE_RTS;
+
+	/* Associate QP with CEP */
+	erdma_cep_get(cep);
+	qp->cep = cep;
+	cep->qp = qp;
+
+	cep->state = ERDMA_EPSTATE_RDMA_MODE;
+
+	qp->attrs.qp_type = ERDMA_QP_PASSIVE;
+	qp->attrs.pd_len = params->private_data_len;
+
+	if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits))
+		qp->attrs.cc = COMPROMISE_CC;
+
+	/* move to rts */
+	ret = erdma_modify_qp_internal(qp, &qp_attrs,
+				       ERDMA_QP_ATTR_STATE |
+				       ERDMA_QP_ATTR_ORD |
+				       ERDMA_QP_ATTR_LLP_HANDLE |
+				       ERDMA_QP_ATTR_IRD |
+				       ERDMA_QP_ATTR_MPA);
+	up_write(&qp->state_lock);
+
+	if (ret)
+		goto error;
+
+	cep->mpa.ext_data.bits = 0;
+	__mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc);
+	cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie);
+
+	ret = erdma_send_mpareqrep(cep, params->private_data,
+				   params->private_data_len);
+	if (!ret) {
+		ret = erdma_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
+		if (ret)
+			goto error;
+
+		erdma_cep_set_free(cep);
+
+		return 0;
+	}
+
+error:
+	erdma_socket_disassoc(cep->sock);
+	sock_release(cep->sock);
+	cep->sock = NULL;
+
+	cep->state = ERDMA_EPSTATE_CLOSED;
+
+	if (cep->cm_id) {
+		cep->cm_id->rem_ref(id);
+		cep->cm_id = NULL;
+	}
+
+	if (qp->cep) {
+		erdma_cep_put(cep);
+		qp->cep = NULL;
+	}
+
+	cep->qp = NULL;
+	erdma_qp_put(qp);
+
+	erdma_cep_set_free(cep);
+	erdma_cep_put(cep);
+
+	return ret;
+}
+
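+/*
+ * iw_cm reject handler: send an MPA reply with the reject flag set,
+ * optionally carrying private data, then close the connection.
+ */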
+int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen)
+{
+	struct erdma_cep *cep = (struct erdma_cep *)id->provider_data;
+
+	erdma_cep_set_inuse(cep);
+	erdma_cep_put(cep);
+
+	erdma_cancel_mpatimer(cep);
+
+	if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) {
+		erdma_cep_set_free(cep);
+		erdma_cep_put(cep);
+
+		return -ECONNRESET;
+	}
+
+	if (__mpa_rr_revision(cep->mpa.hdr.params.bits) == MPA_REVISION_EXT_1) {
+		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
+		erdma_send_mpareqrep(cep, pdata, plen);
+	}
+
+	erdma_socket_disassoc(cep->sock);
+	sock_release(cep->sock);
+	cep->sock = NULL;
+
+	cep->state = ERDMA_EPSTATE_CLOSED;
+
+	erdma_cep_set_free(cep);
+	erdma_cep_put(cep);
+
+	return 0;
+}
+
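+/*
+ * iw_cm listen handler: create a listening TCP socket for the given
+ * address and attach a listening CEP to the cm_id. Multiple listening
+ * CEPs may be chained on id->provider_data.
+ */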
+int erdma_create_listen(struct iw_cm_id *id, int backlog)
+{
+	struct socket *s;
+	struct erdma_cep *cep = NULL;
+	int ret = 0;
+	struct erdma_dev *dev = to_edev(id->device);
+	int addr_family = id->local_addr.ss_family;
+	struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
+
+	if (addr_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	ret = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
+	if (ret < 0)
+		return ret;
+
+	sock_set_reuseaddr(s->sk);
+
+	/* For wildcard addr, limit binding to current device only */
+	if (ipv4_is_zeronet(laddr->sin_addr.s_addr))
+		s->sk->sk_bound_dev_if = dev->netdev->ifindex;
+
+	ret = s->ops->bind(s, (struct sockaddr *)laddr,
+			   sizeof(struct sockaddr_in));
+	if (ret)
+		goto error;
+
+	cep = erdma_cep_alloc(dev);
+	if (!cep) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	erdma_cep_socket_assoc(cep, s);
+
+	ret = erdma_cm_alloc_work(cep, backlog);
+	if (ret)
+		goto error;
+
+	ret = s->ops->listen(s, backlog);
+	if (ret)
+		goto error;
+
+	cep->cm_id = id;
+	id->add_ref(id);
+
+	if (!id->provider_data) {
+		id->provider_data =
+			kmalloc(sizeof(struct list_head), GFP_KERNEL);
+		if (!id->provider_data) {
+			ret = -ENOMEM;
+			goto error;
+		}
+		INIT_LIST_HEAD((struct list_head *)id->provider_data);
+	}
+
+	list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
+	cep->state = ERDMA_EPSTATE_LISTENING;
+
+	return 0;
+
+error:
+	if (cep) {
+		erdma_cep_set_inuse(cep);
+
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+		}
+		cep->sock = NULL;
+		erdma_socket_disassoc(s);
+		cep->state = ERDMA_EPSTATE_CLOSED;
+
+		erdma_cep_set_free(cep);
+		erdma_cep_put(cep);
+	}
+	sock_release(s);
+
+	return ret;
+}
+
+static void erdma_drop_listeners(struct iw_cm_id *id)
+{
+	struct list_head *p, *tmp;
+	/*
+	 * In case of a wildcard rdma_listen on a multi-homed device,
+	 * a listener's IWCM id is associated with more than one listening CEP.
+	 */
+	list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
+		struct erdma_cep *cep =
+			list_entry(p, struct erdma_cep, listenq);
+
+		list_del(p);
+
+		erdma_cep_set_inuse(cep);
+
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+		}
+		if (cep->sock) {
+			erdma_socket_disassoc(cep->sock);
+			sock_release(cep->sock);
+			cep->sock = NULL;
+		}
+		cep->state = ERDMA_EPSTATE_CLOSED;
+		erdma_cep_set_free(cep);
+		erdma_cep_put(cep);
+	}
+}
+
+int erdma_destroy_listen(struct iw_cm_id *id)
+{
+	if (!id->provider_data)
+		return 0;
+
+	erdma_drop_listeners(id);
+	kfree(id->provider_data);
+	id->provider_data = NULL;
+
+	return 0;
+}
+
+int erdma_cm_init(void)
+{
+	erdma_cm_wq = create_singlethread_workqueue("erdma_cm_wq");
+	if (!erdma_cm_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void erdma_cm_exit(void)
+{
+	if (erdma_cm_wq)
+		destroy_workqueue(erdma_cm_wq);
+}
diff --git a/drivers/infiniband/hw/erdma/erdma_cm.h b/drivers/infiniband/hw/erdma/erdma_cm.h
new file mode 100644
index 000000000000..040c05dd3625
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_cm.h
@@ -0,0 +1,167 @@ 
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/*          Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#ifndef __ERDMA_CM_H__
+#define __ERDMA_CM_H__
+
+#include <linux/tcp.h>
+#include <net/sock.h>
+#include <rdma/iw_cm.h>
+
+/* iWarp MPA protocol defs */
+#define MPA_REVISION_EXT_1 129
+#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+#define MPA_KEY_SIZE 16
+#define MPA_DEFAULT_HDR_LEN 28
+
+struct mpa_rr_params {
+	__be16 bits;
+	__be16 pd_len;
+};
+
+/*
+ * MPA request/response Hdr bits & fields
+ */
+enum {
+	MPA_RR_FLAG_MARKERS = __cpu_to_be16(0x8000),
+	MPA_RR_FLAG_CRC = __cpu_to_be16(0x4000),
+	MPA_RR_FLAG_REJECT = __cpu_to_be16(0x2000),
+	MPA_RR_RESERVED = __cpu_to_be16(0x1f00),
+	MPA_RR_MASK_REVISION = __cpu_to_be16(0x00ff)
+};
+
+/*
+ * MPA request/reply header
+ */
+struct mpa_rr {
+	u8 key[16];
+	struct mpa_rr_params params;
+};
+
+struct erdma_mpa_ext {
+	__be32 cookie;
+	__be32 bits;
+};
+
+enum {
+	MPA_EXT_FLAG_CC = cpu_to_be32(0x0000000f),
+};
+
+struct erdma_mpa_info {
+	struct mpa_rr hdr; /* peer mpa hdr in network byte order */
+	struct erdma_mpa_ext ext_data;
+	char *pdata;
+	int bytes_rcvd;
+};
+
+struct erdma_sk_upcalls {
+	void (*sk_state_change)(struct sock *sk);
+	void (*sk_data_ready)(struct sock *sk, int bytes);
+	void (*sk_error_report)(struct sock *sk);
+};
+
+struct erdma_dev;
+
+enum erdma_cep_state {
+	ERDMA_EPSTATE_IDLE = 1,
+	ERDMA_EPSTATE_LISTENING,
+	ERDMA_EPSTATE_CONNECTING,
+	ERDMA_EPSTATE_AWAIT_MPAREQ,
+	ERDMA_EPSTATE_RECVD_MPAREQ,
+	ERDMA_EPSTATE_AWAIT_MPAREP,
+	ERDMA_EPSTATE_RDMA_MODE,
+	ERDMA_EPSTATE_CLOSED
+};
+
+struct erdma_cep {
+	struct iw_cm_id *cm_id;
+	struct erdma_dev *dev;
+	struct list_head devq;
+	spinlock_t lock;
+	struct kref ref;
+	int in_use;
+	wait_queue_head_t waitq;
+	enum erdma_cep_state state;
+
+	struct list_head listenq;
+	struct erdma_cep *listen_cep;
+
+	struct erdma_qp *qp;
+	struct socket *sock;
+
+	struct erdma_cm_work *mpa_timer;
+	struct list_head work_freelist;
+
+	struct erdma_mpa_info mpa;
+	int ord;
+	int ird;
+
+	int pd_len;
+	/* hold user's private data. */
+	void *private_data;
+
+	/* Saved upcalls of the connection's socket */
+	void (*sk_state_change)(struct sock *sk);
+	void (*sk_data_ready)(struct sock *sk);
+	void (*sk_error_report)(struct sock *sk);
+};
+
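+/* Timeouts for the MPA request/reply exchange and the TCP connect. */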
+#define MPAREQ_TIMEOUT (HZ * 20)
+#define MPAREP_TIMEOUT (HZ * 10)
+#define CONNECT_TIMEOUT (HZ * 10)
+
+enum erdma_work_type {
+	ERDMA_CM_WORK_ACCEPT = 1,
+	ERDMA_CM_WORK_READ_MPAHDR,
+	ERDMA_CM_WORK_CLOSE_LLP, /* close socket */
+	ERDMA_CM_WORK_PEER_CLOSE, /* socket indicated peer close */
+	ERDMA_CM_WORK_MPATIMEOUT,
+	ERDMA_CM_WORK_CONNECTED,
+	ERDMA_CM_WORK_CONNECTTIMEOUT
+};
+
+struct erdma_cm_work {
+	struct delayed_work work;
+	struct list_head list;
+	enum erdma_work_type type;
+	struct erdma_cep *cep;
+};
+
+#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a)))
+
+static inline int getname_peer(struct socket *s, struct sockaddr_storage *a)
+{
+	return s->ops->getname(s, (struct sockaddr *)a, 1);
+}
+
+static inline int getname_local(struct socket *s, struct sockaddr_storage *a)
+{
+	return s->ops->getname(s, (struct sockaddr *)a, 0);
+}
+
+int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *param);
+int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param);
+int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen);
+int erdma_create_listen(struct iw_cm_id *id, int backlog);
+int erdma_destroy_listen(struct iw_cm_id *id);
+
+void erdma_cep_get(struct erdma_cep *cep);
+void erdma_cep_put(struct erdma_cep *cep);
+int erdma_cm_queue_work(struct erdma_cep *cep, enum erdma_work_type type);
+
+int erdma_cm_init(void);
+void erdma_cm_exit(void);
+
+#define sk_to_cep(sk) ((struct erdma_cep *)((sk)->sk_user_data))
+
+#endif