
[v3,13/13] nvme-tcp: add NVMe over TCP host driver

Message ID 20181122015615.15763-14-sagi@grimberg.me (mailing list archive)
State New, archived
Series TCP transport binding for NVMe over Fabrics

Commit Message

Sagi Grimberg Nov. 22, 2018, 1:56 a.m. UTC
From: Sagi Grimberg <sagi@lightbitslabs.com>

This patch implements the NVMe over TCP host driver. It can be used to
connect to remote NVMe over Fabrics subsystems over good old TCP/IP.

The driver implements TP 8000, which defines how NVMe over Fabrics
capsules and data are encapsulated in NVMe/TCP PDUs and exchanged on top
of a TCP byte stream. NVMe/TCP header and data digests are supported as
well.

To connect to all NVMe over Fabrics controllers reachable on a given target
port over TCP, use the following command:

	nvme connect-all -t tcp -a $IPADDR

This requires the latest version of nvme-cli with TCP support.
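
To connect to a specific subsystem rather than all discovered ones,
something along these lines can be used (exact options, e.g. whether the
port must be passed via -s, depend on the nvme-cli version installed):

	nvme connect -t tcp -a $IPADDR -s 4420 -n $SUBSYSNQN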

Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Roy Shterman <roys@lightbitslabs.com>
Signed-off-by: Solganik Alexander <sashas@lightbitslabs.com>
---
 drivers/nvme/host/Kconfig  |   15 +
 drivers/nvme/host/Makefile |    3 +
 drivers/nvme/host/tcp.c    | 2290 ++++++++++++++++++++++++++++++++++++
 3 files changed, 2308 insertions(+)
 create mode 100644 drivers/nvme/host/tcp.c

Comments

Christoph Hellwig Nov. 22, 2018, 8:02 a.m. UTC | #1
A few random nitpicks:

> +static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
> +	void *pdu, size_t pdu_len)

Please use two tabs for indenting prototype continuations

> +	len = le32_to_cpu(hdr->plen) - hdr->hlen -
> +		((hdr->flags & NVME_TCP_F_HDGST) ? nvme_tcp_hdgst_len(queue) : 0);

Overly long line.  But it would be much cleaner with a local digest_len
variable anyway.
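
E.g. something like this (untested) would keep it under 80 columns:

	u8 digest_len = (hdr->flags & NVME_TCP_F_HDGST) ?
			nvme_tcp_hdgst_len(queue) : 0;

	len = le32_to_cpu(hdr->plen) - hdr->hlen - digest_len;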

> +static enum nvme_tcp_recv_state nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
> +{
> +	return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
> +		(queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
> +		NVME_TCP_RECV_DATA;
> +}

This just seems to be used in a single switch statement.  Why the detour
through the state enum?

> +{
> +	struct request *rq;
> +	struct nvme_tcp_request *req;
> +
> +	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
> +	if (!rq) {
> +		dev_err(queue->ctrl->ctrl.device,
> +			"queue %d tag 0x%x not found\n",
> +			nvme_tcp_queue_id(queue), cqe->command_id);
> +		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> +		return -EINVAL;
> +	}
> +	req = blk_mq_rq_to_pdu(rq);
> +
> +	nvme_end_request(rq, cqe->status, cqe->result);

req seems unused here.

> +			nvme_tcp_queue_id(queue), pdu->command_id);
> +		return -ENOENT;
> +	}
> +	req = blk_mq_rq_to_pdu(rq);
> +
> +	if (!blk_rq_payload_bytes(rq)) {
> +		dev_err(queue->ctrl->ctrl.device,
> +			"queue %d tag %#x unexpected data\n",
> +			nvme_tcp_queue_id(queue), rq->tag);
> +		return -EIO;
> +	}
> +
> +	queue->data_remaining = le32_to_cpu(pdu->data_length);
> +	/* No support for out-of-order */
> +	WARN_ON(le32_to_cpu(pdu->data_offset));
> +
> +	return 0;

And here as well.

Also, can we just WARN_ON on the offset?

> +	ret = skb_copy_bits(skb, *offset,
> +		&queue->pdu[queue->pdu_offset], rcv_len);

More of this can go on the first line.
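
Something like this (untested):

	ret = skb_copy_bits(skb, *offset, &queue->pdu[queue->pdu_offset],
			rcv_len);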

> +	if (unlikely(ret))
> +		return ret;
> +
> +	queue->pdu_remaining -= rcv_len;
> +	queue->pdu_offset += rcv_len;
> +	*offset += rcv_len;
> +	*len -= rcv_len;
> +	if (queue->pdu_remaining)
> +		return 0;
> +
> +	hdr = (void *)queue->pdu;

hdr is a struct nvme_tcp_hdr *, please use the right cast if we have
to cast - but then again queue->pdu probably should be a void pointer
so that we can use it everywhere without casts.
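
I.e. either something like:

	hdr = (struct nvme_tcp_hdr *)queue->pdu;

or, with queue->pdu turned into a void *, simply:

	hdr = queue->pdu;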

> +static void nvme_tcp_init_recv_iter(struct nvme_tcp_request *req)
> +{
> +	struct bio *bio = req->curr_bio;
> +	struct bio_vec *vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
> +	unsigned int nsegs = bio_segments(bio);
> +
> +	iov_iter_bvec(&req->iter, READ, vec, nsegs,
> +		bio->bi_iter.bi_size);
> +	req->iter.iov_offset = bio->bi_iter.bi_bvec_done;

This code seems largely identical to that in nvme_tcp_init_send_iter
except for passing READ vs WRITE.  Please use a common helper.
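
E.g. a helper along these lines (name illustrative, untested, ignoring
the RQF_SPECIAL_PAYLOAD case that only the send side needs), so that
nvme_tcp_init_recv_iter() becomes just nvme_tcp_init_bio_iter(req, READ):

static void nvme_tcp_init_bio_iter(struct nvme_tcp_request *req, int dir)
{
	struct bio *bio = req->curr_bio;

	iov_iter_bvec(&req->iter, dir,
			__bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter),
			bio_segments(bio), bio->bi_iter.bi_size);
	req->iter.iov_offset = bio->bi_iter.bi_bvec_done;
}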

> +		/*
> +		 * FIXME: This assumes that data comes in-order,
> +		 *  need to handle the out-of-order case.
> +		 */

That sounds like something we should really address before merging.

> +	read_lock(&sk->sk_callback_lock);
> +	queue = sk->sk_user_data;
> +	if (unlikely(!queue || !queue->rd_enabled))
> +		goto done;
> +
> +	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
> +done:
> +	read_unlock(&sk->sk_callback_lock);

Don't we need a rcu_dereference_sk_user_data here?

Also why not:

	queue = rcu_dereference_sk_user_data(sk);
	if (likely(queue && queue->rd_enabled))
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
	read_unlock(&sk->sk_callback_lock);

	
> +static void nvme_tcp_write_space(struct sock *sk)
> +{
> +	struct nvme_tcp_queue *queue;
> +
> +	read_lock_bh(&sk->sk_callback_lock);
> +	queue = sk->sk_user_data;
> +
> +	if (!queue)
> +		goto done;
> +
> +	if (sk_stream_is_writeable(sk)) {
> +		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
> +		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
> +	}
> +done:
> +	read_unlock_bh(&sk->sk_callback_lock);

Same here:

	queue = rcu_dereference_sk_user_data(sk);
	if (queue && sk_stream_is_writeable(sk)) {
		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
	}
	read_unlock(&sk->sk_callback_lock);

(there are a few more places where rcu_dereference_sk_user_data should
be used, skipping them now).

> +static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
> +{
> +	queue->request = NULL;
> +}
> +
> +static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
> +{
> +	union nvme_result res = {};
> +
> +	nvme_end_request(blk_mq_rq_from_pdu(req),
> +		NVME_SC_DATA_XFER_ERROR, res);

This looks like odd formatting, needs one more tab.  But
NVME_SC_DATA_XFER_ERROR is also generally a status that should be
returned from the nvme controller, not made up on the host.


> +		if (queue->data_digest)
> +			nvme_tcp_ddgst_update(queue->snd_hash, page, offset, ret);

Overly long line, please stick to 80 characters.
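
E.g. wrapped:

		if (queue->data_digest)
			nvme_tcp_ddgst_update(queue->snd_hash, page,
					offset, ret);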

> +	if (req->state == NVME_TCP_SEND_CMD_PDU) {
> +		ret = nvme_tcp_try_send_cmd_pdu(req);
> +		if (ret <= 0)
> +			goto done;
> +		if (!nvme_tcp_has_inline_data(req))
> +			return ret;
> +	}
> +
> +	if (req->state == NVME_TCP_SEND_H2C_PDU) {
> +		ret = nvme_tcp_try_send_data_pdu(req);
> +		if (ret <= 0)
> +			goto done;
> +	}
> +
> +	if (req->state == NVME_TCP_SEND_DATA) {
> +		ret = nvme_tcp_try_send_data(req);
> +		if (ret <= 0)
> +			goto done;
> +	}
> +
> +	if (req->state == NVME_TCP_SEND_DDGST)
> +		ret = nvme_tcp_try_send_ddgst(req);

Use a switch statement here?

> +static void nvme_tcp_free_tagset(struct nvme_ctrl *nctrl,
> +		struct blk_mq_tag_set *set)
> +{
> +	blk_mq_free_tag_set(set);
> +}

Please drop this wrapper.

> +static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
> +		bool admin)
> +{

This function does two entirely different things based on the admin
parameter.

> +static void nvme_tcp_stop_admin_queue(struct nvme_ctrl *ctrl)
> +{
> +	nvme_tcp_stop_queue(ctrl, 0);
> +}

This wrapper seems a bit pointless.

> +static int nvme_tcp_start_admin_queue(struct nvme_ctrl *ctrl)
> +{
> +	return nvme_tcp_start_queue(ctrl, 0);
> +}

Same here.

> +int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)

Shouldn't this (or anything in this file for that matter) be static?

> +	if (ctrl->queue_count > 1) {
> +		nvme_stop_queues(ctrl);
> +		nvme_tcp_stop_io_queues(ctrl);
> +		blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl);
> +		if (remove)
> +			nvme_start_queues(ctrl);
> +		nvme_tcp_destroy_io_queues(ctrl, remove);
> +	}

Overly long line above.  Could be easily solved with an early return..
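
E.g., assuming nothing else follows that block in the function (untested):

	if (ctrl->queue_count <= 1)
		return;

	nvme_stop_queues(ctrl);
	nvme_tcp_stop_io_queues(ctrl);
	blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl);
	if (remove)
		nvme_start_queues(ctrl);
	nvme_tcp_destroy_io_queues(ctrl, remove);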

> +static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
> +{
> +	nvme_tcp_teardown_ctrl(ctrl, true);
> +}

Pointless wrapper.

> +static void nvme_tcp_set_sg_null(struct nvme_command *c)
> +{
> +	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
> +
> +	sg->addr = 0;
> +	sg->length = 0;
> +	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
> +			NVME_SGL_FMT_TRANSPORT_A;
> +}
> +
> +static void nvme_tcp_set_sg_host_data(struct nvme_tcp_request *req,
> +		struct nvme_command *c)
> +{
> +	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
> +
> +	sg->addr = 0;
> +	sg->length = cpu_to_le32(req->data_len);
> +	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
> +			NVME_SGL_FMT_TRANSPORT_A;
> +}

Do we really need nvme_tcp_set_sg_null?  Any command it is called
on should have a request with a 0 length, so it could use
nvme_tcp_set_sg_host_data.

> +static enum blk_eh_timer_return
> +nvme_tcp_timeout(struct request *rq, bool reserved)
> +{
> +	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> +	struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
> +	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
> +
> +	dev_dbg(ctrl->ctrl.device,
> +		"queue %d: timeout request %#x type %d\n",
> +		nvme_tcp_queue_id(req->queue), rq->tag,
> +		pdu->hdr.type);
> +
> +	if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
> +		union nvme_result res = {};
> +
> +		nvme_req(rq)->flags |= NVME_REQ_CANCELLED;
> +		nvme_end_request(rq, NVME_SC_ABORT_REQ, res);
> +		return BLK_EH_DONE;

This looks odd.  It's not really the timeout handler's job to
call nvme_end_request here.


> +	if (rq_data_dir(rq) == WRITE) {
> +		req->curr_bio = rq->bio;
> +		if (req->data_len <= nvme_tcp_inline_data_size(queue))
> +			req->pdu_len = req->data_len;
> +	} else {
> +		req->curr_bio = rq->bio;
> +		if (req->curr_bio)
> +			nvme_tcp_init_recv_iter(req);
> +	}

The curr_bio setup is duplicated in both branches.
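
E.g. hoisting the assignment out (untested):

	req->curr_bio = rq->bio;
	if (rq_data_dir(rq) == WRITE) {
		if (req->data_len <= nvme_tcp_inline_data_size(queue))
			req->pdu_len = req->data_len;
	} else if (req->curr_bio) {
		nvme_tcp_init_recv_iter(req);
	}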
Sagi Grimberg Nov. 25, 2018, 9:10 a.m. UTC | #2
>> +static enum nvme_tcp_recv_state nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
>> +{
>> +	return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
>> +		(queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
>> +		NVME_TCP_RECV_DATA;
>> +}
> 
> This just seems to be used in a single switch statement.  Why the detour
> theough the state enum?

I think it's clearer that the calling switch statement is switching on
an explicit state...

I can add an explicit recv_state to the queue that would transition
between these states like the target does, would that be better?

>> +		/*
>> +		 * FIXME: This assumes that data comes in-order,
>> +		 *  need to handle the out-of-order case.
>> +		 */
> 
> That sounds like something we should really address before merging.

That is an old comment from the early days, when the
spec didn't explicitly state that data is transferred in
order for a single request. Will drop the comment.

>> +	read_lock(&sk->sk_callback_lock);
>> +	queue = sk->sk_user_data;
>> +	if (unlikely(!queue || !queue->rd_enabled))
>> +		goto done;
>> +
>> +	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
>> +done:
>> +	read_unlock(&sk->sk_callback_lock);
> 
> Don't we need a rcu_dereference_sk_user_data here?

If I'm protected by the sk_callback_lock I don't need
it, do I? I wonder if I can remove the sk_callback_lock
and move to RCU only? That would require a careful look,
as when I change the callbacks I need to synchronize RCU
before clearing sk_user_data..

It seems that only tunneling ULPs are using it, so I'm not
sure that the actual user should use it...

> Also why not:
> 
> 	queue = rcu_dereference_sk_user_data(sk);
> 	if (likely(queue && queue->rd_enabled))
> 		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
> 	read_unlock(&sk->sk_callback_lock);

That I'll change...

>> +static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
>> +{
>> +	union nvme_result res = {};
>> +
>> +	nvme_end_request(blk_mq_rq_from_pdu(req),
>> +		NVME_SC_DATA_XFER_ERROR, res);
> 
> This looks like odd formatting, needs one more tab.  But
> NVME_SC_DATA_XFER_ERROR is also generally a status that should be
> returned from the nvme controller, not made up on the host.

Well.. the driver did fail to transfer data... What would be a
better completion status then?

>> +	if (req->state == NVME_TCP_SEND_CMD_PDU) {
>> +		ret = nvme_tcp_try_send_cmd_pdu(req);
>> +		if (ret <= 0)
>> +			goto done;
>> +		if (!nvme_tcp_has_inline_data(req))
>> +			return ret;
>> +	}
>> +
>> +	if (req->state == NVME_TCP_SEND_H2C_PDU) {
>> +		ret = nvme_tcp_try_send_data_pdu(req);
>> +		if (ret <= 0)
>> +			goto done;
>> +	}
>> +
>> +	if (req->state == NVME_TCP_SEND_DATA) {
>> +		ret = nvme_tcp_try_send_data(req);
>> +		if (ret <= 0)
>> +			goto done;
>> +	}
>> +
>> +	if (req->state == NVME_TCP_SEND_DDGST)
>> +		ret = nvme_tcp_try_send_ddgst(req);
> 
> Use a switch statement here?

The code flow is expected to fall through as the command sequence
continues, such that I don't need to "re-call" the routine...

For example, for in-capsule write, we will start in SEND_CMD_PDU,
continue to SEND_DATA and then to SEND_DDGST (if data digest exists)..

>> +static void nvme_tcp_free_tagset(struct nvme_ctrl *nctrl,
>> +		struct blk_mq_tag_set *set)
>> +{
>> +	blk_mq_free_tag_set(set);
>> +}
> 
> Please drop this wrapper.
> 
>> +static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
>> +		bool admin)
>> +{
> 
> This function does two entirely different things based on the admin
> paramter.

These two were left as such from my attempts to converge some
code over in the core.. I can remove them if you insist...

>> +int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
> 
> Shouldn't this (or anything in this file for that matter) be static?

Again, leftovers from my attempts to converge code...

>> +static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
>> +{
>> +	nvme_tcp_teardown_ctrl(ctrl, true);
>> +}
> 
> Pointless wrapper.

nvme_tcp_delete_ctrl() is a callback.

>> +static void nvme_tcp_set_sg_null(struct nvme_command *c)
>> +{
>> +	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
>> +
>> +	sg->addr = 0;
>> +	sg->length = 0;
>> +	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
>> +			NVME_SGL_FMT_TRANSPORT_A;
>> +}
>> +
>> +static void nvme_tcp_set_sg_host_data(struct nvme_tcp_request *req,
>> +		struct nvme_command *c)
>> +{
>> +	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
>> +
>> +	sg->addr = 0;
>> +	sg->length = cpu_to_le32(req->data_len);
>> +	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
>> +			NVME_SGL_FMT_TRANSPORT_A;
>> +}
> 
> Do we really need nvme_tcp_set_sg_null?  Any command it is called
> on should have a request with a 0 length, so it could use
> nvme_tcp_set_sg_host_data.

We don't..

>> +static enum blk_eh_timer_return
>> +nvme_tcp_timeout(struct request *rq, bool reserved)
>> +{
>> +	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
>> +	struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
>> +	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
>> +
>> +	dev_dbg(ctrl->ctrl.device,
>> +		"queue %d: timeout request %#x type %d\n",
>> +		nvme_tcp_queue_id(req->queue), rq->tag,
>> +		pdu->hdr.type);
>> +
>> +	if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
>> +		union nvme_result res = {};
>> +
>> +		nvme_req(rq)->flags |= NVME_REQ_CANCELLED;
>> +		nvme_end_request(rq, NVME_SC_ABORT_REQ, res);
>> +		return BLK_EH_DONE;
> 
> This looks odd.  It's not really the timeout handlers job to
> call nvme_end_request here.

Well.. if we are not yet LIVE, we will not trigger error
recovery, which means nothing will complete this command so
something needs to do it...

I think that we need it for rdma too..

...

The rest of the comments will be addressed in the next submission..
Max Gurtovoy Nov. 27, 2018, 12:05 a.m. UTC | #3
>>> +static enum blk_eh_timer_return
>>> +nvme_tcp_timeout(struct request *rq, bool reserved)
>>> +{
>>> +    struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
>>> +    struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
>>> +    struct nvme_tcp_cmd_pdu *pdu = req->pdu;
>>> +
>>> +    dev_dbg(ctrl->ctrl.device,
>>> +        "queue %d: timeout request %#x type %d\n",
>>> +        nvme_tcp_queue_id(req->queue), rq->tag,
>>> +        pdu->hdr.type);
>>> +
>>> +    if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
>>> +        union nvme_result res = {};
>>> +
>>> +        nvme_req(rq)->flags |= NVME_REQ_CANCELLED;
>>> +        nvme_end_request(rq, NVME_SC_ABORT_REQ, res);
>>> +        return BLK_EH_DONE;
>>
>> This looks odd.  It's not really the timeout handler's job to
>> call nvme_end_request here.
>
> Well.. if we are not yet LIVE, we will not trigger error
> recovery, which means nothing will complete this command so
> something needs to do it...
>
> I think that we need it for rdma too..


yes we do. and we've patches in our pipe.

I'm thinking on a wider change in the error/recovery flows but might 
send it "as is" meanwhile.


>
> ...
>
> The rest of the comments will be addressed in the next submission..
Sagi Grimberg Nov. 27, 2018, 7:48 a.m. UTC | #4
>>> This looks odd.  It's not really the timeout handler's job to
>>> call nvme_end_request here.
>>
>> Well.. if we are not yet LIVE, we will not trigger error
>> recovery, which means nothing will complete this command so
>> something needs to do it...
>>
>> I think that we need it for rdma too..
> 
> 
> yes we do. and we've patches in our pipe.
> 
> I'm thinking on a wider change in the error/recovery flows but might 
> send it "as is" meanwhile.

I have it too in the pipe. Do you want to send it out or should I?
Max Gurtovoy Nov. 27, 2018, 10:20 a.m. UTC | #5
On 11/27/2018 9:48 AM, Sagi Grimberg wrote:
>
>>>> This looks odd.  It's not really the timeout handler's job to
>>>> call nvme_end_request here.
>>>
>>> Well.. if we are not yet LIVE, we will not trigger error
>>> recovery, which means nothing will complete this command so
>>> something needs to do it...
>>>
>>> I think that we need it for rdma too..
>>
>>
>> yes we do. and we've patches in our pipe.
>>
>> I'm thinking on a wider change in the error/recovery flows but might 
>> send it "as is" meanwhile.
>
> I have it too in the pipe. Do you want to send it out or should I?


go for it. We've several patches that are under testing and review 
currently.

Patch

diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 88a8b5916624..0f345e207675 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -57,3 +57,18 @@  config NVME_FC
 	  from https://github.com/linux-nvme/nvme-cli.
 
 	  If unsure, say N.
+
+config NVME_TCP
+	tristate "NVM Express over Fabrics TCP host driver"
+	depends on INET
+	depends on BLK_DEV_NVME
+	select NVME_FABRICS
+	help
+	  This provides support for the NVMe over Fabrics protocol using
+	  the TCP transport.  This allows you to use remote block devices
+	  exported using the NVMe protocol set.
+
+	  To configure a NVMe over Fabrics controller use the nvme-cli tool
+	  from https://github.com/linux-nvme/nvme-cli.
+
+	  If unsure, say N.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index aea459c65ae1..8a4b671c5f0c 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -7,6 +7,7 @@  obj-$(CONFIG_BLK_DEV_NVME)		+= nvme.o
 obj-$(CONFIG_NVME_FABRICS)		+= nvme-fabrics.o
 obj-$(CONFIG_NVME_RDMA)			+= nvme-rdma.o
 obj-$(CONFIG_NVME_FC)			+= nvme-fc.o
+obj-$(CONFIG_NVME_TCP)			+= nvme-tcp.o
 
 nvme-core-y				:= core.o
 nvme-core-$(CONFIG_TRACING)		+= trace.o
@@ -21,3 +22,5 @@  nvme-fabrics-y				+= fabrics.o
 nvme-rdma-y				+= rdma.o
 
 nvme-fc-y				+= fc.o
+
+nvme-tcp-y				+= tcp.o
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
new file mode 100644
index 000000000000..5d8409854a25
--- /dev/null
+++ b/drivers/nvme/host/tcp.c
@@ -0,0 +1,2290 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics TCP host.
+ * Copyright (c) 2018 Lightbits Labs. All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/nvme-tcp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/blk-mq.h>
+#include <crypto/hash.h>
+
+#include "nvme.h"
+#include "fabrics.h"
+
+struct nvme_tcp_queue;
+
+enum nvme_tcp_send_state {
+	NVME_TCP_SEND_CMD_PDU = 0,
+	NVME_TCP_SEND_H2C_PDU,
+	NVME_TCP_SEND_DATA,
+	NVME_TCP_SEND_DDGST,
+};
+
+struct nvme_tcp_request {
+	struct nvme_request	req;
+	void			*pdu;
+	struct nvme_tcp_queue	*queue;
+	u32			data_len;
+	u32			pdu_len;
+	u32			pdu_sent;
+	u16			ttag;
+	struct list_head	entry;
+	u32			ddgst;
+
+	struct bio		*curr_bio;
+	struct iov_iter		iter;
+
+	/* send state */
+	size_t			offset;
+	size_t			data_sent;
+	enum nvme_tcp_send_state state;
+};
+
+enum nvme_tcp_queue_flags {
+	NVME_TCP_Q_ALLOCATED	= 0,
+	NVME_TCP_Q_LIVE		= 1,
+};
+
+enum nvme_tcp_recv_state {
+	NVME_TCP_RECV_PDU = 0,
+	NVME_TCP_RECV_DATA,
+	NVME_TCP_RECV_DDGST,
+};
+
+struct nvme_tcp_ctrl;
+struct nvme_tcp_queue {
+	struct socket		*sock;
+	struct work_struct	io_work;
+	int			io_cpu;
+
+	spinlock_t		lock;
+	struct list_head	send_list;
+
+	/* recv state */
+	char			*pdu;
+	int			pdu_remaining;
+	int			pdu_offset;
+	size_t			data_remaining;
+	size_t			ddgst_remaining;
+
+	/* send state */
+	struct nvme_tcp_request *request;
+
+	int			queue_size;
+	size_t			cmnd_capsule_len;
+	struct nvme_tcp_ctrl	*ctrl;
+	unsigned long		flags;
+	bool			rd_enabled;
+
+	bool			hdr_digest;
+	bool			data_digest;
+	struct ahash_request	*rcv_hash;
+	struct ahash_request	*snd_hash;
+	__le32			exp_ddgst;
+	__le32			recv_ddgst;
+
+	struct page_frag_cache	pf_cache;
+
+	void (*sc)(struct sock *);
+	void (*dr)(struct sock *);
+	void (*ws)(struct sock *);
+};
+
+struct nvme_tcp_ctrl {
+	/* read only in the hot path */
+	struct nvme_tcp_queue	*queues;
+	struct blk_mq_tag_set	tag_set;
+
+	/* other member variables */
+	struct list_head	list;
+	struct blk_mq_tag_set	admin_tag_set;
+	struct sockaddr_storage addr;
+	struct sockaddr_storage src_addr;
+	struct nvme_ctrl	ctrl;
+
+	struct work_struct	err_work;
+	struct delayed_work	connect_work;
+	struct nvme_tcp_request async_req;
+};
+
+static LIST_HEAD(nvme_tcp_ctrl_list);
+static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
+static struct workqueue_struct *nvme_tcp_wq;
+static struct blk_mq_ops nvme_tcp_mq_ops;
+static struct blk_mq_ops nvme_tcp_admin_mq_ops;
+
+static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
+{
+	return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
+}
+
+static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
+{
+	return queue - queue->ctrl->queues;
+}
+
+static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
+{
+	u32 queue_idx = nvme_tcp_queue_id(queue);
+
+	if (queue_idx == 0)
+		return queue->ctrl->admin_tag_set.tags[queue_idx];
+	return queue->ctrl->tag_set.tags[queue_idx - 1];
+}
+
+static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
+{
+	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
+{
+	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
+{
+	return queue->cmnd_capsule_len - sizeof(struct nvme_command);
+}
+
+static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
+{
+	return req == &req->queue->ctrl->async_req;
+}
+
+static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
+{
+	struct request *rq;
+	unsigned int bytes;
+
+	if (unlikely(nvme_tcp_async_req(req)))
+		return false; /* async events don't have a request */
+
+	rq = blk_mq_rq_from_pdu(req);
+	bytes = blk_rq_payload_bytes(rq);
+
+	return rq_data_dir(rq) == WRITE && bytes &&
+		bytes <= nvme_tcp_inline_data_size(req->queue);
+}
+
+static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
+{
+	return req->iter.bvec->bv_page;
+}
+
+static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
+{
+	return req->iter.bvec->bv_offset + req->iter.iov_offset;
+}
+
+static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
+{
+	return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
+			req->pdu_len - req->pdu_sent);
+}
+
+static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
+{
+	return req->iter.iov_offset;
+}
+
+static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
+{
+	return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
+			req->pdu_len - req->pdu_sent : 0;
+}
+
+static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
+		int len)
+{
+	return nvme_tcp_pdu_data_left(req) <= len;
+}
+
+static void nvme_tcp_init_send_iter(struct nvme_tcp_request *req)
+{
+	struct request *rq = blk_mq_rq_from_pdu(req);
+	struct bio_vec *vec;
+	unsigned int size;
+	int nsegs;
+	size_t offset;
+
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
+		vec = &rq->special_vec;
+		nsegs = 1;
+		size = blk_rq_payload_bytes(rq);
+		offset = 0;
+	} else {
+		struct bio *bio = req->curr_bio;
+
+		vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+		nsegs = bio_segments(bio);
+		size = bio->bi_iter.bi_size;
+		offset = bio->bi_iter.bi_bvec_done;
+	}
+
+	iov_iter_bvec(&req->iter, WRITE, vec, nsegs, size);
+	req->iter.iov_offset = offset;
+}
+
+static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
+		int len)
+{
+	req->data_sent += len;
+	req->pdu_sent += len;
+	iov_iter_advance(&req->iter, len);
+	if (!iov_iter_count(&req->iter) &&
+	    req->data_sent < req->data_len) {
+		req->curr_bio = req->curr_bio->bi_next;
+		nvme_tcp_init_send_iter(req);
+	}
+}
+
+static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+
+	spin_lock(&queue->lock);
+	list_add_tail(&req->entry, &queue->send_list);
+	spin_unlock(&queue->lock);
+
+	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
+static inline struct nvme_tcp_request *
+nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
+{
+	struct nvme_tcp_request *req;
+
+	spin_lock(&queue->lock);
+	req = list_first_entry_or_null(&queue->send_list,
+			struct nvme_tcp_request, entry);
+	if (req)
+		list_del(&req->entry);
+	spin_unlock(&queue->lock);
+
+	return req;
+}
+
+static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, u32 *dgst)
+{
+	ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
+	crypto_ahash_final(hash);
+}
+
+static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
+		struct page *page, off_t off, size_t len)
+{
+	struct scatterlist sg;
+
+	sg_init_marker(&sg, 1);
+	sg_set_page(&sg, page, len, off);
+	ahash_request_set_crypt(hash, &sg, NULL, len);
+	crypto_ahash_update(hash);
+}
+
+static inline void nvme_tcp_hdgst(struct ahash_request *hash,
+		void *pdu, size_t len)
+{
+	struct scatterlist sg;
+
+	sg_init_one(&sg, pdu, len);
+	ahash_request_set_crypt(hash, &sg, pdu + len, len);
+	crypto_ahash_digest(hash);
+}
+
+static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
+	void *pdu, size_t pdu_len)
+{
+	struct nvme_tcp_hdr *hdr = pdu;
+	__le32 recv_digest;
+	__le32 exp_digest;
+
+	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d: header digest flag is cleared\n",
+			nvme_tcp_queue_id(queue));
+		return -EPROTO;
+	}
+
+	recv_digest = *(__le32 *)(pdu + hdr->hlen);
+	nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
+	exp_digest = *(__le32 *)(pdu + hdr->hlen);
+	if (recv_digest != exp_digest) {
+		dev_err(queue->ctrl->ctrl.device,
+			"header digest error: recv %#x expected %#x\n",
+			le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
+{
+	struct nvme_tcp_hdr *hdr = pdu;
+	u32 len;
+
+	len = le32_to_cpu(hdr->plen) - hdr->hlen -
+		((hdr->flags & NVME_TCP_F_HDGST) ? nvme_tcp_hdgst_len(queue) : 0);
+
+	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d: data digest flag is cleared\n",
+		nvme_tcp_queue_id(queue));
+		return -EPROTO;
+	}
+	crypto_ahash_init(queue->rcv_hash);
+
+	return 0;
+}
+
+static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
+		struct request *rq, unsigned int hctx_idx)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+	page_frag_free(req->pdu);
+}
+
+static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
+		struct request *rq, unsigned int hctx_idx,
+		unsigned int numa_node)
+{
+	struct nvme_tcp_ctrl *ctrl = set->driver_data;
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
+	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+	req->pdu = page_frag_alloc(&queue->pf_cache,
+		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
+		GFP_KERNEL | __GFP_ZERO);
+	if (!req->pdu)
+		return -ENOMEM;
+
+	req->queue = queue;
+	nvme_req(rq)->ctrl = &ctrl->ctrl;
+
+	return 0;
+}
+
+static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+		unsigned int hctx_idx)
+{
+	struct nvme_tcp_ctrl *ctrl = data;
+	struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
+
+	hctx->driver_data = queue;
+	return 0;
+}
+
+static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+		unsigned int hctx_idx)
+{
+	struct nvme_tcp_ctrl *ctrl = data;
+	struct nvme_tcp_queue *queue = &ctrl->queues[0];
+
+	hctx->driver_data = queue;
+	return 0;
+}
+
+static enum nvme_tcp_recv_state nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
+{
+	return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
+		(queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
+		NVME_TCP_RECV_DATA;
+}
+
+static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
+{
+	queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
+				nvme_tcp_hdgst_len(queue);
+	queue->pdu_offset = 0;
+	queue->data_remaining = -1;
+	queue->ddgst_remaining = 0;
+}
+
+void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
+{
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+		return;
+
+	queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work);
+}
+
+static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
+		struct nvme_completion *cqe)
+{
+	struct request *rq;
+	struct nvme_tcp_request *req;
+
+	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
+	if (!rq) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d tag 0x%x not found\n",
+			nvme_tcp_queue_id(queue), cqe->command_id);
+		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+		return -EINVAL;
+	}
+	req = blk_mq_rq_to_pdu(rq);
+
+	nvme_end_request(rq, cqe->status, cqe->result);
+
+	return 0;
+}
+
+static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
+		struct nvme_tcp_data_pdu *pdu)
+{
+	struct nvme_tcp_request *req;
+	struct request *rq;
+
+	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+	if (!rq) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d tag %#x not found\n",
+			nvme_tcp_queue_id(queue), pdu->command_id);
+		return -ENOENT;
+	}
+	req = blk_mq_rq_to_pdu(rq);
+
+	if (!blk_rq_payload_bytes(rq)) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d tag %#x unexpected data\n",
+			nvme_tcp_queue_id(queue), rq->tag);
+		return -EIO;
+	}
+
+	queue->data_remaining = le32_to_cpu(pdu->data_length);
+	/* No support for out-of-order */
+	WARN_ON(le32_to_cpu(pdu->data_offset));
+
+	return 0;
+
+}
+
+static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
+		struct nvme_tcp_rsp_pdu *pdu)
+{
+	struct nvme_completion *cqe = &pdu->cqe;
+	int ret = 0;
+
+	/*
+	 * AEN requests are special as they don't time out and can
+	 * survive any kind of queue freeze and often don't respond to
+	 * aborts.  We don't even bother to allocate a struct request
+	 * for them but rather special case them here.
+	 */
+	if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
+	    cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
+		nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
+				&cqe->result);
+	else
+		ret = nvme_tcp_process_nvme_cqe(queue, cqe);
+
+	return ret;
+}
+
+static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
+		struct nvme_tcp_r2t_pdu *pdu)
+{
+	struct nvme_tcp_data_pdu *data = req->pdu;
+	struct nvme_tcp_queue *queue = req->queue;
+	struct request *rq = blk_mq_rq_from_pdu(req);
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+	u8 ddgst = nvme_tcp_ddgst_len(queue);
+
+	req->pdu_len = le32_to_cpu(pdu->r2t_length);
+	req->pdu_sent = 0;
+
+	if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
+		dev_err(queue->ctrl->ctrl.device,
+			"req %d r2t length %u exceeded data length %u (%zu sent)\n",
+			rq->tag, req->pdu_len, req->data_len,
+			req->data_sent);
+		return -EPROTO;
+	}
+
+	if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
+		dev_err(queue->ctrl->ctrl.device,
+			"req %d unexpected r2t offset %u (expected %zu)\n",
+			rq->tag, le32_to_cpu(pdu->r2t_offset),
+			req->data_sent);
+		return -EPROTO;
+	}
+
+	memset(data, 0, sizeof(*data));
+	data->hdr.type = nvme_tcp_h2c_data;
+	data->hdr.flags = NVME_TCP_F_DATA_LAST;
+	if (queue->hdr_digest)
+		data->hdr.flags |= NVME_TCP_F_HDGST;
+	if (queue->data_digest)
+		data->hdr.flags |= NVME_TCP_F_DDGST;
+	data->hdr.hlen = sizeof(*data);
+	data->hdr.pdo = data->hdr.hlen + hdgst;
+	data->hdr.plen =
+		cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
+	data->ttag = pdu->ttag;
+	data->command_id = rq->tag;
+	data->data_offset = cpu_to_le32(req->data_sent);
+	data->data_length = cpu_to_le32(req->pdu_len);
+	return 0;
+}
+
+static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
+		struct nvme_tcp_r2t_pdu *pdu)
+{
+	struct nvme_tcp_request *req;
+	struct request *rq;
+	int ret;
+
+	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+	if (!rq) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d tag %#x not found\n",
+			nvme_tcp_queue_id(queue), pdu->command_id);
+		return -ENOENT;
+	}
+	req = blk_mq_rq_to_pdu(rq);
+
+	ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
+	if (unlikely(ret))
+		return ret;
+
+	req->state = NVME_TCP_SEND_H2C_PDU;
+	req->offset = 0;
+
+	nvme_tcp_queue_request(req);
+
+	return 0;
+}
+
+static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
+		unsigned int *offset, size_t *len)
+{
+	struct nvme_tcp_hdr *hdr;
+	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
+	int ret;
+
+	ret = skb_copy_bits(skb, *offset,
+		&queue->pdu[queue->pdu_offset], rcv_len);
+	if (unlikely(ret))
+		return ret;
+
+	queue->pdu_remaining -= rcv_len;
+	queue->pdu_offset += rcv_len;
+	*offset += rcv_len;
+	*len -= rcv_len;
+	if (queue->pdu_remaining)
+		return 0;
+
+	hdr = (void *)queue->pdu;
+	if (queue->hdr_digest) {
+		ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
+		if (unlikely(ret))
+			return ret;
+	}
+
+
+	if (queue->data_digest) {
+		ret = nvme_tcp_check_ddgst(queue, queue->pdu);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	switch (hdr->type) {
+	case nvme_tcp_c2h_data:
+		ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
+		break;
+	case nvme_tcp_rsp:
+		nvme_tcp_init_recv_ctx(queue);
+		ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
+		break;
+	case nvme_tcp_r2t:
+		nvme_tcp_init_recv_ctx(queue);
+		ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
+		break;
+	default:
+		dev_err(queue->ctrl->ctrl.device,
+			"unsupported pdu type (%d)\n", hdr->type);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static void nvme_tcp_init_recv_iter(struct nvme_tcp_request *req)
+{
+	struct bio *bio = req->curr_bio;
+	struct bio_vec *vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+	unsigned int nsegs = bio_segments(bio);
+
+	iov_iter_bvec(&req->iter, READ, vec, nsegs,
+		bio->bi_iter.bi_size);
+	req->iter.iov_offset = bio->bi_iter.bi_bvec_done;
+}
+
+static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
+			      unsigned int *offset, size_t *len)
+{
+	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
+	struct nvme_tcp_request *req;
+	struct request *rq;
+
+	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+	if (!rq) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d tag %#x not found\n",
+			nvme_tcp_queue_id(queue), pdu->command_id);
+		return -ENOENT;
+	}
+	req = blk_mq_rq_to_pdu(rq);
+
+	while (true) {
+		int recv_len, ret;
+
+		recv_len = min_t(size_t, *len, queue->data_remaining);
+		if (!recv_len)
+			break;
+
+		/*
+		 * FIXME: This assumes that data comes in-order,
+		 *  need to handle the out-of-order case.
+		 */
+		if (!iov_iter_count(&req->iter)) {
+			req->curr_bio = req->curr_bio->bi_next;
+
+			/*
+			 * If we don`t have any bios it means that controller
+			 * sent more data than we requested, hence error
+			 */
+			if (!req->curr_bio) {
+				dev_err(queue->ctrl->ctrl.device,
+					"queue %d no space in request %#x",
+					nvme_tcp_queue_id(queue), rq->tag);
+				nvme_tcp_init_recv_ctx(queue);
+				return -EIO;
+			}
+			nvme_tcp_init_recv_iter(req);
+		}
+
+		/* we can read only from what is left in this bio */
+		recv_len = min_t(size_t, recv_len,
+				iov_iter_count(&req->iter));
+
+		if (queue->data_digest)
+			ret = skb_copy_and_hash_datagram_iter(skb, *offset,
+				&req->iter, recv_len, queue->rcv_hash);
+		else
+			ret = skb_copy_datagram_iter(skb, *offset,
+					&req->iter, recv_len);
+		if (ret) {
+			dev_err(queue->ctrl->ctrl.device,
+				"queue %d failed to copy request %#x data",
+				nvme_tcp_queue_id(queue), rq->tag);
+			return ret;
+		}
+
+		*len -= recv_len;
+		*offset += recv_len;
+		queue->data_remaining -= recv_len;
+	}
+
+	if (!queue->data_remaining) {
+		if (queue->data_digest) {
+			nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
+			queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
+		} else {
+			nvme_tcp_init_recv_ctx(queue);
+		}
+	}
+
+	return 0;
+}
+
+static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
+		struct sk_buff *skb, unsigned int *offset, size_t *len)
+{
+	char *ddgst = (char *)&queue->recv_ddgst;
+	size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
+	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
+	int ret;
+
+	ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
+	if (unlikely(ret))
+		return ret;
+
+	queue->ddgst_remaining -= recv_len;
+	*offset += recv_len;
+	*len -= recv_len;
+	if (queue->ddgst_remaining)
+		return 0;
+
+	if (queue->recv_ddgst != queue->exp_ddgst) {
+		dev_err(queue->ctrl->ctrl.device,
+			"data digest error: recv %#x expected %#x\n",
+			le32_to_cpu(queue->recv_ddgst),
+			le32_to_cpu(queue->exp_ddgst));
+		return -EIO;
+	}
+
+	nvme_tcp_init_recv_ctx(queue);
+	return 0;
+}
+
+static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
+			     unsigned int offset, size_t len)
+{
+	struct nvme_tcp_queue *queue = desc->arg.data;
+	size_t consumed = len;
+	int result;
+
+	while (len) {
+		switch (nvme_tcp_recv_state(queue)) {
+		case NVME_TCP_RECV_PDU:
+			result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
+			break;
+		case NVME_TCP_RECV_DATA:
+			result = nvme_tcp_recv_data(queue, skb, &offset, &len);
+			break;
+		case NVME_TCP_RECV_DDGST:
+			result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
+			break;
+		default:
+			result = -EFAULT;
+		}
+		if (result) {
+			dev_err(queue->ctrl->ctrl.device,
+				"receive failed:  %d\n", result);
+			queue->rd_enabled = false;
+			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+			return result;
+		}
+	}
+
+	return consumed;
+}
+
+static void nvme_tcp_data_ready(struct sock *sk)
+{
+	struct nvme_tcp_queue *queue;
+
+	read_lock(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+	if (unlikely(!queue || !queue->rd_enabled))
+		goto done;
+
+	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+done:
+	read_unlock(&sk->sk_callback_lock);
+}
+
+static void nvme_tcp_write_space(struct sock *sk)
+{
+	struct nvme_tcp_queue *queue;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+
+	if (!queue)
+		goto done;
+
+	if (sk_stream_is_writeable(sk)) {
+		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+	}
+done:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void nvme_tcp_state_change(struct sock *sk)
+{
+	struct nvme_tcp_queue *queue;
+
+	read_lock(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+	if (!queue)
+		goto done;
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+	case TCP_CLOSE_WAIT:
+	case TCP_LAST_ACK:
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
+		/* fallthrough */
+		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+		break;
+	default:
+		dev_info(queue->ctrl->ctrl.device,
+			"queue %d socket state %d\n",
+			nvme_tcp_queue_id(queue), sk->sk_state);
+	}
+
+	queue->sc(sk);
+done:
+	read_unlock(&sk->sk_callback_lock);
+}
+
+static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
+{
+	queue->request = NULL;
+}
+
+static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
+{
+	union nvme_result res = {};
+
+	nvme_end_request(blk_mq_rq_from_pdu(req),
+		NVME_SC_DATA_XFER_ERROR, res);
+}
+
+static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+
+	while (true) {
+		struct page *page = nvme_tcp_req_cur_page(req);
+		size_t offset = nvme_tcp_req_cur_offset(req);
+		size_t len = nvme_tcp_req_cur_length(req);
+		bool last = nvme_tcp_pdu_last_send(req, len);
+		int ret, flags = MSG_DONTWAIT;
+
+		if (last && !queue->data_digest)
+			flags |= MSG_EOR;
+		else
+			flags |= MSG_MORE;
+
+		ret = kernel_sendpage(queue->sock, page, offset, len, flags);
+		if (ret <= 0)
+			return ret;
+
+		nvme_tcp_advance_req(req, ret);
+		if (queue->data_digest)
+			nvme_tcp_ddgst_update(queue->snd_hash, page, offset, ret);
+
+		/* fully successful last write*/
+		if (last && ret == len) {
+			if (queue->data_digest) {
+				nvme_tcp_ddgst_final(queue->snd_hash,
+					&req->ddgst);
+				req->state = NVME_TCP_SEND_DDGST;
+				req->offset = 0;
+			} else {
+				nvme_tcp_done_send_req(queue);
+			}
+			return 1;
+		}
+	}
+	return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+	bool inline_data = nvme_tcp_has_inline_data(req);
+	int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+	int len = sizeof(*pdu) + hdgst - req->offset;
+	int ret;
+
+	if (queue->hdr_digest && !req->offset)
+		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+
+	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
+			offset_in_page(pdu) + req->offset, len,  flags);
+	if (unlikely(ret <= 0))
+		return ret;
+
+	len -= ret;
+	if (!len) {
+		if (inline_data) {
+			req->state = NVME_TCP_SEND_DATA;
+			if (queue->data_digest)
+				crypto_ahash_init(queue->snd_hash);
+			nvme_tcp_init_send_iter(req);
+		} else {
+			nvme_tcp_done_send_req(queue);
+		}
+		return 1;
+	}
+	req->offset += ret;
+
+	return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+	struct nvme_tcp_data_pdu *pdu = req->pdu;
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+	int len = sizeof(*pdu) - req->offset + hdgst;
+	int ret;
+
+	if (queue->hdr_digest && !req->offset)
+		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+
+	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
+			offset_in_page(pdu) + req->offset, len,
+			MSG_DONTWAIT | MSG_MORE);
+	if (unlikely(ret <= 0))
+		return ret;
+
+	len -= ret;
+	if (!len) {
+		req->state = NVME_TCP_SEND_DATA;
+		if (queue->data_digest)
+			crypto_ahash_init(queue->snd_hash);
+		if (!req->data_sent)
+			nvme_tcp_init_send_iter(req);
+		return 1;
+	}
+	req->offset += ret;
+
+	return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+	int ret;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+	struct kvec iov = {
+		.iov_base = &req->ddgst + req->offset,
+		.iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
+	};
+
+	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+	if (unlikely(ret <= 0))
+		return ret;
+
+	if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
+		nvme_tcp_done_send_req(queue);
+		return 1;
+	}
+
+	req->offset += ret;
+	return -EAGAIN;
+}
+
+static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
+{
+	struct nvme_tcp_request *req;
+	int ret = 1;
+
+	if (!queue->request) {
+		queue->request = nvme_tcp_fetch_request(queue);
+		if (!queue->request)
+			return 0;
+	}
+	req = queue->request;
+
+	if (req->state == NVME_TCP_SEND_CMD_PDU) {
+		ret = nvme_tcp_try_send_cmd_pdu(req);
+		if (ret <= 0)
+			goto done;
+		if (!nvme_tcp_has_inline_data(req))
+			return ret;
+	}
+
+	if (req->state == NVME_TCP_SEND_H2C_PDU) {
+		ret = nvme_tcp_try_send_data_pdu(req);
+		if (ret <= 0)
+			goto done;
+	}
+
+	if (req->state == NVME_TCP_SEND_DATA) {
+		ret = nvme_tcp_try_send_data(req);
+		if (ret <= 0)
+			goto done;
+	}
+
+	if (req->state == NVME_TCP_SEND_DDGST)
+		ret = nvme_tcp_try_send_ddgst(req);
+done:
+	if (ret == -EAGAIN)
+		ret = 0;
+	return ret;
+}
+
+static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
+{
+	struct sock *sk = queue->sock->sk;
+	read_descriptor_t rd_desc;
+	int consumed;
+
+	rd_desc.arg.data = queue;
+	rd_desc.count = 1;
+	lock_sock(sk);
+	consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
+	release_sock(sk);
+	return consumed;
+}
+
+static void nvme_tcp_io_work(struct work_struct *w)
+{
+	struct nvme_tcp_queue *queue =
+		container_of(w, struct nvme_tcp_queue, io_work);
+	unsigned long start = jiffies + msecs_to_jiffies(1);
+
+	do {
+		bool pending = false;
+		int result;
+
+		result = nvme_tcp_try_send(queue);
+		if (result > 0) {
+			pending = true;
+		} else if (unlikely(result < 0)) {
+			dev_err(queue->ctrl->ctrl.device,
+				"failed to send request %d\n", result);
+			if (result != -EPIPE)
+				nvme_tcp_fail_request(queue->request);
+			nvme_tcp_done_send_req(queue);
+			return;
+		}
+
+		result = nvme_tcp_try_recv(queue);
+		if (result > 0)
+			pending = true;
+
+		if (!pending)
+			return;
+
+	} while (time_after(jiffies, start)); /* quota is exhausted */
+
+	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
+static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
+
+	ahash_request_free(queue->rcv_hash);
+	ahash_request_free(queue->snd_hash);
+	crypto_free_ahash(tfm);
+}
+
+static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
+{
+	struct crypto_ahash *tfm;
+
+	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+	if (!queue->snd_hash)
+		goto free_tfm;
+	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
+
+	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+	if (!queue->rcv_hash)
+		goto free_snd_hash;
+	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
+
+	return 0;
+free_snd_hash:
+	ahash_request_free(queue->snd_hash);
+free_tfm:
+	crypto_free_ahash(tfm);
+	return -ENOMEM;
+}
+
+static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
+{
+	struct nvme_tcp_request *async = &ctrl->async_req;
+
+	page_frag_free(async->pdu);
+}
+
+static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
+{
+	struct nvme_tcp_queue *queue = &ctrl->queues[0];
+	struct nvme_tcp_request *async = &ctrl->async_req;
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+	async->pdu = page_frag_alloc(&queue->pf_cache,
+		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
+		GFP_KERNEL | __GFP_ZERO);
+	if (!async->pdu)
+		return -ENOMEM;
+
+	async->queue = &ctrl->queues[0];
+	return 0;
+}
+
+static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+
+	if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
+		return;
+
+	if (queue->hdr_digest || queue->data_digest)
+		nvme_tcp_free_crypto(queue);
+
+	sock_release(queue->sock);
+	kfree(queue->pdu);
+}
+
+static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
+{
+	struct nvme_tcp_icreq_pdu *icreq;
+	struct nvme_tcp_icresp_pdu *icresp;
+	struct msghdr msg = {};
+	struct kvec iov;
+	bool ctrl_hdgst, ctrl_ddgst;
+	int ret;
+
+	icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
+	if (!icreq)
+		return -ENOMEM;
+
+	icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
+	if (!icresp) {
+		ret = -ENOMEM;
+		goto free_icreq;
+	}
+
+	icreq->hdr.type = nvme_tcp_icreq;
+	icreq->hdr.hlen = sizeof(*icreq);
+	icreq->hdr.pdo = 0;
+	icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
+	icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
+	icreq->maxr2t = 0; /* single inflight r2t supported */
+	icreq->hpda = 0; /* no alignment constraint */
+	if (queue->hdr_digest)
+		icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
+	if (queue->data_digest)
+		icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
+
+	iov.iov_base = icreq;
+	iov.iov_len = sizeof(*icreq);
+	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+	if (ret < 0)
+		goto free_icresp;
+
+	memset(&msg, 0, sizeof(msg));
+	iov.iov_base = icresp;
+	iov.iov_len = sizeof(*icresp);
+	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+			iov.iov_len, msg.msg_flags);
+	if (ret < 0)
+		goto free_icresp;
+
+	ret = -EINVAL;
+	if (icresp->hdr.type != nvme_tcp_icresp) {
+		pr_err("queue %d: bad type returned %d\n",
+			nvme_tcp_queue_id(queue), icresp->hdr.type);
+		goto free_icresp;
+	}
+
+	if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
+		pr_err("queue %d: bad pdu length returned %d\n",
+			nvme_tcp_queue_id(queue), icresp->hdr.plen);
+		goto free_icresp;
+	}
+
+	if (icresp->pfv != NVME_TCP_PFV_1_0) {
+		pr_err("queue %d: bad pfv returned %d\n",
+			nvme_tcp_queue_id(queue), icresp->pfv);
+		goto free_icresp;
+	}
+
+	ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
+	if ((queue->data_digest && !ctrl_ddgst) ||
+	    (!queue->data_digest && ctrl_ddgst)) {
+		pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
+			nvme_tcp_queue_id(queue),
+			queue->data_digest ? "enabled" : "disabled",
+			ctrl_ddgst ? "enabled" : "disabled");
+		goto free_icresp;
+	}
+
+	ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
+	if ((queue->hdr_digest && !ctrl_hdgst) ||
+	    (!queue->hdr_digest && ctrl_hdgst)) {
+		pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
+			nvme_tcp_queue_id(queue),
+			queue->hdr_digest ? "enabled" : "disabled",
+			ctrl_hdgst ? "enabled" : "disabled");
+		goto free_icresp;
+	}
+
+	if (icresp->cpda != 0) {
+		pr_err("queue %d: unsupported cpda returned %d\n",
+			nvme_tcp_queue_id(queue), icresp->cpda);
+		goto free_icresp;
+	}
+
+	ret = 0;
+free_icresp:
+	kfree(icresp);
+free_icreq:
+	kfree(icreq);
+	return ret;
+}
+
+static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
+		int qid, size_t queue_size)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
+	int ret, opt, rcv_pdu_size;
+
+	queue->ctrl = ctrl;
+	INIT_LIST_HEAD(&queue->send_list);
+	spin_lock_init(&queue->lock);
+	INIT_WORK(&queue->io_work, nvme_tcp_io_work);
+	queue->queue_size = queue_size;
+
+	if (qid > 0)
+		queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
+	else
+		queue->cmnd_capsule_len = sizeof(struct nvme_command) +
+						NVME_TCP_ADMIN_CCSZ;
+
+	ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
+			IPPROTO_TCP, &queue->sock);
+	if (ret) {
+		dev_err(ctrl->ctrl.device,
+			"failed to create socket: %d\n", ret);
+		return ret;
+	}
+
+	/* Single syn retry */
+	opt = 1;
+	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
+			(char *)&opt, sizeof(opt));
+	if (ret) {
+		dev_err(ctrl->ctrl.device,
+			"failed to set TCP_SYNCNT sock opt %d\n", ret);
+		goto err_sock;
+	}
+
+	/* Set TCP no delay */
+	opt = 1;
+	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
+			TCP_NODELAY, (char *)&opt, sizeof(opt));
+	if (ret) {
+		dev_err(ctrl->ctrl.device,
+			"failed to set TCP_NODELAY sock opt %d\n", ret);
+		goto err_sock;
+	}
+
+	/*
+	 * Cleanup whatever is sitting in the TCP transmit queue on socket
+	 * close. This is done to prevent stale data from being sent should
+	 * the network connection be restored before TCP times out.
+	 */
+	ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
+			(char *)&sol, sizeof(sol));
+	if (ret) {
+		dev_err(ctrl->ctrl.device,
+			"failed to set SO_LINGER sock opt %d\n", ret);
+		goto err_sock;
+	}
+
+	queue->sock->sk->sk_allocation = GFP_ATOMIC;
+	queue->io_cpu = (qid == 0) ? 0 : qid - 1;
+	queue->request = NULL;
+	queue->data_remaining = 0;
+	queue->ddgst_remaining = 0;
+	queue->pdu_remaining = 0;
+	queue->pdu_offset = 0;
+	sk_set_memalloc(queue->sock->sk);
+
+	if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
+		ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
+			sizeof(ctrl->src_addr));
+		if (ret) {
+			dev_err(ctrl->ctrl.device,
+				"failed to bind queue %d socket %d\n",
+				qid, ret);
+			goto err_sock;
+		}
+	}
+
+	queue->hdr_digest = nctrl->opts->hdr_digest;
+	queue->data_digest = nctrl->opts->data_digest;
+	if (queue->hdr_digest || queue->data_digest) {
+		ret = nvme_tcp_alloc_crypto(queue);
+		if (ret) {
+			dev_err(ctrl->ctrl.device,
+				"failed to allocate queue %d crypto\n", qid);
+			goto err_sock;
+		}
+	}
+
+	rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
+			nvme_tcp_hdgst_len(queue);
+	queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
+	if (!queue->pdu) {
+		ret = -ENOMEM;
+		goto err_crypto;
+	}
+
+	dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
+			nvme_tcp_queue_id(queue));
+
+	ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
+		sizeof(ctrl->addr), 0);
+	if (ret) {
+		dev_err(ctrl->ctrl.device,
+			"failed to connect socket: %d\n", ret);
+		goto err_rcv_pdu;
+	}
+
+	ret = nvme_tcp_init_connection(queue);
+	if (ret)
+		goto err_init_connect;
+
+	queue->rd_enabled = true;
+	set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
+	nvme_tcp_init_recv_ctx(queue);
+
+	write_lock_bh(&queue->sock->sk->sk_callback_lock);
+	queue->sock->sk->sk_user_data = queue;
+	queue->sc = queue->sock->sk->sk_state_change;
+	queue->dr = queue->sock->sk->sk_data_ready;
+	queue->ws = queue->sock->sk->sk_write_space;
+	queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
+	queue->sock->sk->sk_state_change = nvme_tcp_state_change;
+	queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
+
+	return 0;
+
+err_init_connect:
+	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+err_rcv_pdu:
+	kfree(queue->pdu);
+err_crypto:
+	if (queue->hdr_digest || queue->data_digest)
+		nvme_tcp_free_crypto(queue);
+err_sock:
+	sock_release(queue->sock);
+	queue->sock = NULL;
+	return ret;
+}
+
+static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
+{
+	struct socket *sock = queue->sock;
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_user_data  = NULL;
+	sock->sk->sk_data_ready = queue->dr;
+	sock->sk->sk_state_change = queue->sc;
+	sock->sk->sk_write_space  = queue->ws;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
+{
+	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+	nvme_tcp_restore_sock_calls(queue);
+	cancel_work_sync(&queue->io_work);
+}
+
+static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+
+	if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
+		return;
+
+	__nvme_tcp_stop_queue(queue);
+}
+
+static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	int ret;
+
+	if (idx)
+		ret = nvmf_connect_io_queue(nctrl, idx);
+	else
+		ret = nvmf_connect_admin_queue(nctrl);
+
+	if (!ret) {
+		set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
+	} else {
+		__nvme_tcp_stop_queue(&ctrl->queues[idx]);
+		dev_err(nctrl->device,
+			"failed to connect queue: %d ret=%d\n", idx, ret);
+	}
+	return ret;
+}
+
+static void nvme_tcp_free_tagset(struct nvme_ctrl *nctrl,
+		struct blk_mq_tag_set *set)
+{
+	blk_mq_free_tag_set(set);
+}
+
+static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
+		bool admin)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	struct blk_mq_tag_set *set;
+	int ret;
+
+	if (admin) {
+		set = &ctrl->admin_tag_set;
+		memset(set, 0, sizeof(*set));
+		set->ops = &nvme_tcp_admin_mq_ops;
+		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+		set->reserved_tags = 2; /* connect + keep-alive */
+		set->numa_node = NUMA_NO_NODE;
+		set->cmd_size = sizeof(struct nvme_tcp_request);
+		set->driver_data = ctrl;
+		set->nr_hw_queues = 1;
+		set->timeout = ADMIN_TIMEOUT;
+	} else {
+		set = &ctrl->tag_set;
+		memset(set, 0, sizeof(*set));
+		set->ops = &nvme_tcp_mq_ops;
+		set->queue_depth = nctrl->sqsize + 1;
+		set->reserved_tags = 1; /* fabric connect */
+		set->numa_node = NUMA_NO_NODE;
+		set->flags = BLK_MQ_F_SHOULD_MERGE;
+		set->cmd_size = sizeof(struct nvme_tcp_request);
+		set->driver_data = ctrl;
+		set->nr_hw_queues = nctrl->queue_count - 1;
+		set->timeout = NVME_IO_TIMEOUT;
+	}
+
+	ret = blk_mq_alloc_tag_set(set);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return set;
+}
+
+static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
+{
+	if (to_tcp_ctrl(ctrl)->async_req.pdu) {
+		nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
+		to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
+	}
+
+	nvme_tcp_free_queue(ctrl, 0);
+}
+
+static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
+{
+	int i;
+
+	for (i = 1; i < ctrl->queue_count; i++)
+		nvme_tcp_free_queue(ctrl, i);
+}
+
+static void nvme_tcp_stop_admin_queue(struct nvme_ctrl *ctrl)
+{
+	nvme_tcp_stop_queue(ctrl, 0);
+}
+
+static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
+{
+	int i;
+
+	for (i = 1; i < ctrl->queue_count; i++)
+		nvme_tcp_stop_queue(ctrl, i);
+}
+
+static int nvme_tcp_start_admin_queue(struct nvme_ctrl *ctrl)
+{
+	return nvme_tcp_start_queue(ctrl, 0);
+}
+
+static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
+{
+	int i, ret = 0;
+
+	for (i = 1; i < ctrl->queue_count; i++) {
+		ret = nvme_tcp_start_queue(ctrl, i);
+		if (ret)
+			goto out_stop_queues;
+	}
+
+	return 0;
+
+out_stop_queues:
+	for (i--; i >= 1; i--)
+		nvme_tcp_stop_queue(ctrl, i);
+	return ret;
+}
+
+static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
+{
+	int ret;
+
+	ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
+	if (ret)
+		return ret;
+
+	ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
+	if (ret)
+		goto out_free_queue;
+
+	return 0;
+
+out_free_queue:
+	nvme_tcp_free_queue(ctrl, 0);
+	return ret;
+}
+
+static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
+{
+	int i, ret;
+
+	for (i = 1; i < ctrl->queue_count; i++) {
+		ret = nvme_tcp_alloc_queue(ctrl, i, ctrl->sqsize + 1);
+		if (ret)
+			goto out_free_queues;
+	}
+
+	return 0;
+
+out_free_queues:
+	for (i--; i >= 1; i--)
+		nvme_tcp_free_queue(ctrl, i);
+
+	return ret;
+}
+
+static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
+{
+	return min(ctrl->queue_count - 1, num_online_cpus());
+}
+
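+/*
+ * Negotiate the number of I/O queues with the controller (Set Features,
+ * Number of Queues) and allocate a TCP queue for each one granted.
+ */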
+static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
+{
+	unsigned int nr_io_queues;
+	int ret;
+
+	nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
+	ret = nvme_set_queue_count(ctrl, &nr_io_queues);
+	if (ret)
+		return ret;
+
+	ctrl->queue_count = nr_io_queues + 1;
+	if (ctrl->queue_count < 2)
+		return 0;
+
+	dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues);
+
+	return nvme_tcp_alloc_io_queues(ctrl);
+}
+
+static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
+{
+	nvme_tcp_stop_io_queues(ctrl);
+	if (remove) {
+		if (ctrl->ops->flags & NVME_F_FABRICS)
+			blk_cleanup_queue(ctrl->connect_q);
+		nvme_tcp_free_tagset(ctrl, ctrl->tagset);
+	}
+	nvme_tcp_free_io_queues(ctrl);
+}
+
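+/*
+ * On the first association (new == true) allocate the I/O tag set and the
+ * connect_q; on reconnects only the number of hardware queues may have
+ * changed, so just update the existing tag set before reconnecting the
+ * queues.
+ */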
+static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
+{
+	int ret;
+
+	ret = nvme_alloc_io_queues(ctrl);
+	if (ret)
+		return ret;
+
+	if (new) {
+		ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
+		if (IS_ERR(ctrl->tagset)) {
+			ret = PTR_ERR(ctrl->tagset);
+			goto out_free_io_queues;
+		}
+
+		if (ctrl->ops->flags & NVME_F_FABRICS) {
+			ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
+			if (IS_ERR(ctrl->connect_q)) {
+				ret = PTR_ERR(ctrl->connect_q);
+				goto out_free_tag_set;
+			}
+		}
+	} else {
+		blk_mq_update_nr_hw_queues(ctrl->tagset,
+			ctrl->queue_count - 1);
+	}
+
+	ret = nvme_tcp_start_io_queues(ctrl);
+	if (ret)
+		goto out_cleanup_connect_q;
+
+	return 0;
+
+out_cleanup_connect_q:
+	if (new && (ctrl->ops->flags & NVME_F_FABRICS))
+		blk_cleanup_queue(ctrl->connect_q);
+out_free_tag_set:
+	if (new)
+		nvme_tcp_free_tagset(ctrl, ctrl->tagset);
+out_free_io_queues:
+	nvme_tcp_free_io_queues(ctrl);
+	return ret;
+}
+
+static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
+{
+	nvme_tcp_stop_admin_queue(ctrl);
+	if (remove) {
+		free_opal_dev(ctrl->opal_dev);
+		blk_cleanup_queue(ctrl->admin_q);
+		nvme_tcp_free_tagset(ctrl, ctrl->admin_tagset);
+	}
+	nvme_tcp_free_admin_queue(ctrl);
+}
+
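+/*
+ * Bring up the admin queue: allocate and connect it, read CAP to size
+ * sqsize, enable the controller and run the identify sequence.
+ */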
+static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
+{
+	int error;
+
+	error = nvme_tcp_alloc_admin_queue(ctrl);
+	if (error)
+		return error;
+
+	if (new) {
+		ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
+		if (IS_ERR(ctrl->admin_tagset)) {
+			error = PTR_ERR(ctrl->admin_tagset);
+			goto out_free_queue;
+		}
+
+		ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
+		if (IS_ERR(ctrl->admin_q)) {
+			error = PTR_ERR(ctrl->admin_q);
+			goto out_free_tagset;
+		}
+	}
+
+	error = nvme_tcp_start_admin_queue(ctrl);
+	if (error)
+		goto out_cleanup_queue;
+
+	error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
+	if (error) {
+		dev_err(ctrl->device,
+			"prop_get NVME_REG_CAP failed\n");
+		goto out_stop_queue;
+	}
+
+	ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
+
+	error = nvme_enable_ctrl(ctrl, ctrl->cap);
+	if (error)
+		goto out_stop_queue;
+
+	error = nvme_init_identify(ctrl);
+	if (error)
+		goto out_stop_queue;
+
+	return 0;
+
+out_stop_queue:
+	nvme_tcp_stop_admin_queue(ctrl);
+out_cleanup_queue:
+	if (new)
+		blk_cleanup_queue(ctrl->admin_q);
+out_free_tagset:
+	if (new)
+		nvme_tcp_free_tagset(ctrl, ctrl->admin_tagset);
+out_free_queue:
+	nvme_tcp_free_admin_queue(ctrl);
+	return error;
+}
+
+static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
+		bool remove)
+{
+	blk_mq_quiesce_queue(ctrl->admin_q);
+	nvme_tcp_stop_admin_queue(ctrl);
+	blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl);
+	blk_mq_unquiesce_queue(ctrl->admin_q);
+	nvme_tcp_destroy_admin_queue(ctrl, remove);
+}
+
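+/*
+ * Quiesce the block queues, stop the TCP queues and cancel every request
+ * still in flight; only unquiesce here when the controller is going away,
+ * the error recovery and reset paths restart the queues once teardown is
+ * done.
+ */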
+static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
+		bool remove)
+{
+	if (ctrl->queue_count > 1) {
+		nvme_stop_queues(ctrl);
+		nvme_tcp_stop_io_queues(ctrl);
+		blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl);
+		if (remove)
+			nvme_start_queues(ctrl);
+		nvme_tcp_destroy_io_queues(ctrl, remove);
+	}
+}
+
+static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
+{
+	/* If we are resetting/deleting then do nothing */
+	if (ctrl->state != NVME_CTRL_CONNECTING) {
+		WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
+			ctrl->state == NVME_CTRL_LIVE);
+		return;
+	}
+
+	if (nvmf_should_reconnect(ctrl)) {
+		dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
+			ctrl->opts->reconnect_delay);
+		queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
+				ctrl->opts->reconnect_delay * HZ);
+	} else {
+		dev_info(ctrl->device, "Removing controller...\n");
+		nvme_delete_ctrl(ctrl);
+	}
+}
+
+static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
+{
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	int ret;
+
+	ret = nvme_tcp_configure_admin_queue(ctrl, new);
+	if (ret)
+		return ret;
+
+	if (ctrl->icdoff) {
+		dev_err(ctrl->device, "icdoff is not supported!\n");
+		ret = -EINVAL;
+		goto destroy_admin;
+	}
+
+	if (opts->queue_size > ctrl->sqsize + 1)
+		dev_warn(ctrl->device,
+			"queue_size %zu > ctrl sqsize %u, clamping down\n",
+			opts->queue_size, ctrl->sqsize + 1);
+
+	if (ctrl->sqsize + 1 > ctrl->maxcmd) {
+		dev_warn(ctrl->device,
+			"sqsize %u > ctrl maxcmd %u, clamping down\n",
+			ctrl->sqsize + 1, ctrl->maxcmd);
+		ctrl->sqsize = ctrl->maxcmd - 1;
+	}
+
+	if (ctrl->queue_count > 1) {
+		ret = nvme_tcp_configure_io_queues(ctrl, new);
+		if (ret)
+			goto destroy_admin;
+	}
+
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
+		/* state change failure is ok if we're in DELETING state */
+		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+		ret = -EINVAL;
+		goto destroy_io;
+	}
+
+	nvme_start_ctrl(ctrl);
+	return 0;
+
+destroy_io:
+	if (ctrl->queue_count > 1)
+		nvme_tcp_destroy_io_queues(ctrl, new);
+destroy_admin:
+	nvme_tcp_stop_admin_queue(ctrl);
+	nvme_tcp_destroy_admin_queue(ctrl, new);
+	return ret;
+}
+
+static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
+{
+	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
+			struct nvme_tcp_ctrl, connect_work);
+	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+	++ctrl->nr_reconnects;
+
+	if (nvme_tcp_setup_ctrl(ctrl, false))
+		goto requeue;
+
+	dev_info(ctrl->device, "Successfully reconnected (%d attempts)\n",
+			ctrl->nr_reconnects);
+
+	ctrl->nr_reconnects = 0;
+
+	return;
+
+requeue:
+	dev_info(ctrl->device, "Failed reconnect attempt %d\n",
+			ctrl->nr_reconnects);
+	nvme_tcp_reconnect_or_remove(ctrl);
+}
+
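+/*
+ * Error recovery: tear down the I/O and admin queues, fast-fail anything
+ * still pending, then either schedule a reconnect or remove the controller.
+ */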
+static void nvme_tcp_error_recovery_work(struct work_struct *work)
+{
+	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
+				struct nvme_tcp_ctrl, err_work);
+	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+	nvme_stop_keep_alive(ctrl);
+	nvme_tcp_teardown_io_queues(ctrl, false);
+	/* unquiesce to fail fast pending requests */
+	nvme_start_queues(ctrl);
+	nvme_tcp_teardown_admin_queue(ctrl, false);
+
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
+		/* state change failure is ok if we're in DELETING state */
+		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+		return;
+	}
+
+	nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
+{
+	nvme_tcp_teardown_io_queues(ctrl, shutdown);
+	if (shutdown)
+		nvme_shutdown_ctrl(ctrl);
+	else
+		nvme_disable_ctrl(ctrl, ctrl->cap);
+	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
+}
+
+static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
+{
+	nvme_tcp_teardown_ctrl(ctrl, true);
+}
+
+static void nvme_reset_ctrl_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(work, struct nvme_ctrl, reset_work);
+
+	nvme_stop_ctrl(ctrl);
+	nvme_tcp_teardown_ctrl(ctrl, false);
+
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
+		/* state change failure is ok if we're in DELETING state */
+		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+		return;
+	}
+
+	if (nvme_tcp_setup_ctrl(ctrl, false))
+		goto out_fail;
+
+	return;
+
+out_fail:
+	++ctrl->nr_reconnects;
+	nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
+{
+	cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
+	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
+}
+
+static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+
+	if (list_empty(&ctrl->list))
+		goto free_ctrl;
+
+	mutex_lock(&nvme_tcp_ctrl_mutex);
+	list_del(&ctrl->list);
+	mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+	nvmf_free_options(nctrl->opts);
+free_ctrl:
+	kfree(ctrl->queues);
+	kfree(ctrl);
+}
+
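+/*
+ * Data is described with NVMe SGL descriptors: a zero-length transport data
+ * block when there is no payload, an offset data block for in-capsule write
+ * data placed right after the SQE, and a transport data block when the data
+ * is carried in separate data PDUs.
+ */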
+static void nvme_tcp_set_sg_null(struct nvme_command *c)
+{
+	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+	sg->addr = 0;
+	sg->length = 0;
+	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
+			NVME_SGL_FMT_TRANSPORT_A;
+}
+
+static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
+		struct nvme_tcp_request *req, struct nvme_command *c)
+{
+	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
+	sg->length = cpu_to_le32(req->data_len);
+	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
+}
+
+static void nvme_tcp_set_sg_host_data(struct nvme_tcp_request *req,
+		struct nvme_command *c)
+{
+	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+	sg->addr = 0;
+	sg->length = cpu_to_le32(req->data_len);
+	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
+			NVME_SGL_FMT_TRANSPORT_A;
+}
+
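+/*
+ * AER commands are not driven by blk-mq; they use the command id reserved
+ * past the tag space (NVME_AQ_BLK_MQ_DEPTH) and carry no data.
+ */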
+static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
+	struct nvme_tcp_queue *queue = &ctrl->queues[0];
+	struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
+	struct nvme_command *cmd = &pdu->cmd;
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+	memset(pdu, 0, sizeof(*pdu));
+	pdu->hdr.type = nvme_tcp_cmd;
+	if (queue->hdr_digest)
+		pdu->hdr.flags |= NVME_TCP_F_HDGST;
+	pdu->hdr.hlen = sizeof(*pdu);
+	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
+
+	cmd->common.opcode = nvme_admin_async_event;
+	cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
+	cmd->common.flags |= NVME_CMD_SGL_METABUF;
+	nvme_tcp_set_sg_null(cmd);
+
+	ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
+	ctrl->async_req.offset = 0;
+	ctrl->async_req.curr_bio = NULL;
+	ctrl->async_req.data_len = 0;
+
+	nvme_tcp_queue_request(&ctrl->async_req);
+}
+
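+/*
+ * Request timeout handler: if the controller is not live (a reset or
+ * reconnect is already in progress) complete the request as aborted right
+ * away; otherwise kick error recovery and keep the timer running until the
+ * teardown path cancels the request.
+ */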
+static enum blk_eh_timer_return
+nvme_tcp_timeout(struct request *rq, bool reserved)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
+	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+
+	dev_dbg(ctrl->ctrl.device,
+		"queue %d: timeout request %#x type %d\n",
+		nvme_tcp_queue_id(req->queue), rq->tag,
+		pdu->hdr.type);
+
+	if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+		union nvme_result res = {};
+
+		nvme_req(rq)->flags |= NVME_REQ_CANCELLED;
+		nvme_end_request(rq, NVME_SC_ABORT_REQ, res);
+		return BLK_EH_DONE;
+	}
+
+	/* queue error recovery */
+	nvme_tcp_error_recovery(&ctrl->ctrl);
+
+	return BLK_EH_RESET_TIMER;
+}
+
+static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
+		struct request *rq)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+	struct nvme_command *c = &pdu->cmd;
+
+	c->common.flags |= NVME_CMD_SGL_METABUF;
+
+	if (!req->data_len) {
+		nvme_tcp_set_sg_null(c);
+		return 0;
+	}
+
+	if (rq_data_dir(rq) == WRITE &&
+	    req->data_len <= nvme_tcp_inline_data_size(queue))
+		nvme_tcp_set_sg_inline(queue, req, c);
+	else
+		nvme_tcp_set_sg_host_data(req, c);
+
+	return 0;
+}
+
+static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
+		struct request *rq)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+	struct nvme_tcp_queue *queue = req->queue;
+	u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
+	blk_status_t ret;
+
+	ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
+	if (ret)
+		return ret;
+
+	req->state = NVME_TCP_SEND_CMD_PDU;
+	req->offset = 0;
+	req->data_sent = 0;
+	req->pdu_len = 0;
+	req->pdu_sent = 0;
+	req->data_len = blk_rq_payload_bytes(rq);
+
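+	/*
+	 * Writes that fit within the in-capsule data size are sent inline
+	 * with the command PDU (pdu_len != 0); for reads the receive
+	 * iterator is set up front so incoming data PDUs can be copied
+	 * straight into the request bios.
+	 */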
+	req->curr_bio = rq->bio;
+	if (rq_data_dir(rq) == WRITE) {
+		if (req->data_len <= nvme_tcp_inline_data_size(queue))
+			req->pdu_len = req->data_len;
+	} else if (req->curr_bio) {
+		nvme_tcp_init_recv_iter(req);
+	}
+
+	pdu->hdr.type = nvme_tcp_cmd;
+	pdu->hdr.flags = 0;
+	if (queue->hdr_digest)
+		pdu->hdr.flags |= NVME_TCP_F_HDGST;
+	if (queue->data_digest && req->pdu_len) {
+		pdu->hdr.flags |= NVME_TCP_F_DDGST;
+		ddgst = nvme_tcp_ddgst_len(queue);
+	}
+	pdu->hdr.hlen = sizeof(*pdu);
+	pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
+	pdu->hdr.plen =
+		cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
+
+	ret = nvme_tcp_map_data(queue, rq);
+	if (unlikely(ret)) {
+		dev_err(queue->ctrl->ctrl.device,
+			"Failed to map data (%d)\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
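+/*
+ * blk-mq ->queue_rq: fail or requeue commands while the queue is not live,
+ * otherwise build the command PDU and hand the request to the send path.
+ */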
+static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
+{
+	struct nvme_ns *ns = hctx->queue->queuedata;
+	struct nvme_tcp_queue *queue = hctx->driver_data;
+	struct request *rq = bd->rq;
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
+	blk_status_t ret;
+
+	if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
+		return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
+
+	ret = nvme_tcp_setup_cmd_pdu(ns, rq);
+	if (unlikely(ret))
+		return ret;
+
+	blk_mq_start_request(rq);
+
+	nvme_tcp_queue_request(req);
+
+	return BLK_STS_OK;
+}
+
+static struct blk_mq_ops nvme_tcp_mq_ops = {
+	.queue_rq	= nvme_tcp_queue_rq,
+	.complete	= nvme_complete_rq,
+	.init_request	= nvme_tcp_init_request,
+	.exit_request	= nvme_tcp_exit_request,
+	.init_hctx	= nvme_tcp_init_hctx,
+	.timeout	= nvme_tcp_timeout,
+};
+
+static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
+	.queue_rq	= nvme_tcp_queue_rq,
+	.complete	= nvme_complete_rq,
+	.init_request	= nvme_tcp_init_request,
+	.exit_request	= nvme_tcp_exit_request,
+	.init_hctx	= nvme_tcp_init_admin_hctx,
+	.timeout	= nvme_tcp_timeout,
+};
+
+static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
+	.name			= "tcp",
+	.module			= THIS_MODULE,
+	.flags			= NVME_F_FABRICS,
+	.reg_read32		= nvmf_reg_read32,
+	.reg_read64		= nvmf_reg_read64,
+	.reg_write32		= nvmf_reg_write32,
+	.free_ctrl		= nvme_tcp_free_ctrl,
+	.submit_async_event	= nvme_tcp_submit_async_event,
+	.delete_ctrl		= nvme_tcp_delete_ctrl,
+	.get_address		= nvmf_get_address,
+	.stop_ctrl		= nvme_tcp_stop_ctrl,
+};
+
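+/*
+ * Check whether a controller matching these IP options already exists; used
+ * to reject duplicate associations unless duplicate_connect is set.
+ */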
+static bool
+nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
+{
+	struct nvme_tcp_ctrl *ctrl;
+	bool found = false;
+
+	mutex_lock(&nvme_tcp_ctrl_mutex);
+	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
+		found = nvmf_ip_options_match(&ctrl->ctrl, opts);
+		if (found)
+			break;
+	}
+	mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+	return found;
+}
+
+static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
+		struct nvmf_ctrl_options *opts)
+{
+	struct nvme_tcp_ctrl *ctrl;
+	int ret;
+
+	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
+	if (!ctrl)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ctrl->list);
+	ctrl->ctrl.opts = opts;
+	/* +1 for the admin queue */
+	ctrl->ctrl.queue_count = opts->nr_io_queues + 1;
+	ctrl->ctrl.sqsize = opts->queue_size - 1;
+	ctrl->ctrl.kato = opts->kato;
+
+	INIT_DELAYED_WORK(&ctrl->connect_work, nvme_tcp_reconnect_ctrl_work);
+	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
+	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
+
+	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
+		opts->trsvcid =
+			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
+		if (!opts->trsvcid) {
+			ret = -ENOMEM;
+			goto out_free_ctrl;
+		}
+		opts->mask |= NVMF_OPT_TRSVCID;
+	}
+
+	ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+			opts->traddr, opts->trsvcid, &ctrl->addr);
+	if (ret) {
+		pr_err("malformed address passed: %s:%s\n",
+			opts->traddr, opts->trsvcid);
+		goto out_free_ctrl;
+	}
+
+	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
+		ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+			opts->host_traddr, NULL, &ctrl->src_addr);
+		if (ret) {
+			pr_err("malformed src address passed: %s\n",
+			       opts->host_traddr);
+			goto out_free_ctrl;
+		}
+	}
+
+	if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
+		ret = -EALREADY;
+		goto out_free_ctrl;
+	}
+
+	ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues),
+				GFP_KERNEL);
+	if (!ctrl->queues) {
+		ret = -ENOMEM;
+		goto out_free_ctrl;
+	}
+
+	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
+	if (ret)
+		goto out_kfree_queues;
+
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
+		WARN_ON_ONCE(1);
+		ret = -EINTR;
+		goto out_uninit_ctrl;
+	}
+
+	ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
+	if (ret)
+		goto out_uninit_ctrl;
+
+	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
+		ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
+
+	nvme_get_ctrl(&ctrl->ctrl);
+
+	mutex_lock(&nvme_tcp_ctrl_mutex);
+	list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
+	mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+	return &ctrl->ctrl;
+
+out_uninit_ctrl:
+	nvme_uninit_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
+	if (ret > 0)
+		ret = -EIO;
+	return ERR_PTR(ret);
+out_kfree_queues:
+	kfree(ctrl->queues);
+out_free_ctrl:
+	kfree(ctrl);
+	return ERR_PTR(ret);
+}
+
+static struct nvmf_transport_ops nvme_tcp_transport = {
+	.name		= "tcp",
+	.module		= THIS_MODULE,
+	.required_opts	= NVMF_OPT_TRADDR,
+	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
+			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
+			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST,
+	.create_ctrl	= nvme_tcp_create_ctrl,
+};
+
+static int __init nvme_tcp_init_module(void)
+{
+	int ret;
+
+	nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
+			WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+	if (!nvme_tcp_wq)
+		return -ENOMEM;
+
+	ret = nvmf_register_transport(&nvme_tcp_transport);
+	if (ret) {
+		destroy_workqueue(nvme_tcp_wq);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __exit nvme_tcp_cleanup_module(void)
+{
+	struct nvme_tcp_ctrl *ctrl;
+
+	nvmf_unregister_transport(&nvme_tcp_transport);
+
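+	/*
+	 * Delete any controllers that are still around and wait for the
+	 * delete work to finish before destroying the workqueue.
+	 */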
+	mutex_lock(&nvme_tcp_ctrl_mutex);
+	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
+		nvme_delete_ctrl(&ctrl->ctrl);
+	mutex_unlock(&nvme_tcp_ctrl_mutex);
+	flush_workqueue(nvme_delete_wq);
+
+	destroy_workqueue(nvme_tcp_wq);
+}
+
+module_init(nvme_tcp_init_module);
+module_exit(nvme_tcp_cleanup_module);
+
+MODULE_LICENSE("GPL v2");