
[v1,net-next,05/15] nvme-tcp: Add DDP offload control path

Message ID 20201207210649.19194-6-borisp@mellanox.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series nvme-tcp receive offloads

Checks

Context Check Description
netdev/apply fail Patch does not apply to net-next
netdev/tree_selection success Clearly marked for net-next

Commit Message

Boris Pismenny Dec. 7, 2020, 9:06 p.m. UTC
This commit introduces direct data placement (DDP) offload to NVMe-TCP.
A per-queue offload context is established after the NVMe/TCP handshake
using the tcp_ddp_sk_add/del NDOs.
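
In essence, the control path amounts to the following condensed sketch
(a paraphrase of nvme_tcp_offload_socket()/nvme_tcp_unoffload_socket()
from the diff below; error handling omitted):

    struct nvme_tcp_ddp_config config = {
        .cfg.type   = TCP_DDP_NVME,
        .pfv        = NVME_TCP_PFV_1_0,
        .cpda       = 0,
        .dgst       = (queue->hdr_digest ? NVME_TCP_HDR_DIGEST_ENABLE : 0) |
                      (queue->data_digest ? NVME_TCP_DATA_DIGEST_ENABLE : 0),
        .queue_size = queue->queue_size,
        .queue_id   = nvme_tcp_queue_id(queue),
        .io_cpu     = queue->io_cpu,
    };

    /* after the NVMe/TCP handshake: hand the socket to the NIC driver */
    netdev->tcp_ddp_ops->tcp_ddp_sk_add(netdev, queue->sock->sk,
                                        (struct tcp_ddp_config *)&config);
    inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = &nvme_tcp_ddp_ulp_ops;
    set_bit(NVME_TCP_Q_OFFLOADS, &queue->flags);

    /* on queue teardown: remove the socket from the NIC driver */
    netdev->tcp_ddp_ops->tcp_ddp_sk_del(netdev, queue->sock->sk);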

Additionally, a resynchronization routine is used to help the hardware
recover from TCP OOO and resume the offload.
Resynchronization operates as follows:
1. TCP OOO causes the NIC HW to stop the offload.
2. The NIC HW identifies a PDU header at some TCP sequence number
and asks NVMe-TCP to confirm it.
This request is delivered from the NIC driver to NVMe-TCP by first
finding the socket for the packet that triggered the request, and
then finding the nvme_tcp_queue associated with that socket.
Finally, the request is recorded in the nvme_tcp_queue.
3. When NVMe-TCP observes the requested TCP sequence, it compares it
with the PDU header TCP sequence and reports the result to the
NIC driver (tcp_ddp_resync), which updates the HW and resumes the
offload when all is successful (see the sketch below).
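
In code terms, the request/response exchange is small; a condensed
sketch of what the patch below implements (error paths omitted):

    /* NIC driver -> NVMe-TCP: "please confirm a PDU header at TCP seq" */
    bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
    {
        struct nvme_tcp_queue *queue = sk->sk_user_data;

        /* upper 32 bits: requested seq, lower 32 bits: request flags */
        atomic64_set(&queue->resync_req, ((u64)seq << 32) | flags);
        return true;
    }

    /* NVMe-TCP -> NIC driver: called for every PDU header we parse */
    void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, u32 pdu_seq)
    {
        struct net_device *netdev = queue->ctrl->offloading_netdev;
        u64 val = atomic64_read(&queue->resync_req);

        if (!(val & TCP_DDP_RESYNC_REQ) || before(pdu_seq, val >> 32))
            return; /* no pending request, or this PDU precedes it */

        /* cmpxchg so that a newer driver request is never lost */
        if (atomic64_cmpxchg(&queue->resync_req, val,
                             val & ~TCP_DDP_RESYNC_REQ))
            netdev->tcp_ddp_ops->tcp_ddp_resync(netdev, queue->sock->sk,
                                                pdu_seq);
    }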

Furthermore, we let the offloading driver advertise its maximum HW
sectors/segments via tcp_ddp_limits.
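
For example, assuming a device that reports max_ddp_sgl_len = 128 (a
hypothetical value), the conversion done in nvme_tcp_offload_limits()
below yields:

    ctrl->max_segments   = 128;
    /* each 4KB SGL entry covers 8 512-byte sectors: 128 << 3 = 1024 */
    ctrl->max_hw_sectors = 128 << (ilog2(SZ_4K) - 9); /* i.e. 512KB per I/O */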

A follow-up patch introduces the data-path changes required for this
offload.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Ben Ben-Ishay <benishay@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Yoray Zack <yorayz@mellanox.com>
---
 drivers/nvme/host/tcp.c | 197 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 195 insertions(+), 2 deletions(-)

Comments

Shai Malin Dec. 10, 2020, 5:15 p.m. UTC | #1
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index c0c33320fe65..ef96e4a02bbd 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -14,6 +14,7 @@
 #include <linux/blk-mq.h>
 #include <crypto/hash.h>
 #include <net/busy_poll.h>
+#include <net/tcp_ddp.h>
 
 #include "nvme.h"
 #include "fabrics.h"
@@ -62,6 +63,7 @@ enum nvme_tcp_queue_flags {
 	NVME_TCP_Q_ALLOCATED	= 0,
 	NVME_TCP_Q_LIVE		= 1,
 	NVME_TCP_Q_POLLING	= 2,
+	NVME_TCP_Q_OFFLOADS     = 3,
 };

The same comment from the previous version - we are concerned that perhaps 
the generic term "offload" for both the transport type (for the Marvell work) 
and for the DDP and CRC offload queue (for the Mellanox work) may be 
misleading and confusing to developers and to users.

As suggested by Sagi, we can call this NVME_TCP_Q_DDP.
Boris Pismenny Dec. 14, 2020, 6:38 a.m. UTC | #2
On 10/12/2020 19:15, Shai Malin wrote:
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index c0c33320fe65..ef96e4a02bbd 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -14,6 +14,7 @@
>  #include <linux/blk-mq.h>
>  #include <crypto/hash.h>
>  #include <net/busy_poll.h>
> +#include <net/tcp_ddp.h>
>  
>  #include "nvme.h"
>  #include "fabrics.h"
> @@ -62,6 +63,7 @@ enum nvme_tcp_queue_flags {
>  	NVME_TCP_Q_ALLOCATED	= 0,
>  	NVME_TCP_Q_LIVE		= 1,
>  	NVME_TCP_Q_POLLING	= 2,
> +	NVME_TCP_Q_OFFLOADS     = 3,
>  };
> 
> The same comment from the previous version - we are concerned that perhaps 
> the generic term "offload" for both the transport type (for the Marvell work) 
> and for the DDP and CRC offload queue (for the Mellanox work) may be 
> misleading and confusing to developers and to users.
> 
> As suggested by Sagi, we can call this NVME_TCP_Q_DDP. 
> 

While I don't mind changing the naming here, I wonder why you don't call
the TOE you use TOE rather than TCP_OFFLOAD, and then "offload" is free for this?

Moreover, the most common use of offload in the kernel is for partial offloads
like this one, and not for full offloads (such as toe).
Shai Malin Dec. 15, 2020, 1:33 p.m. UTC | #3
On 12/14/2020 08:38, Boris Pismenny wrote:
> On 10/12/2020 19:15, Shai Malin wrote:
> > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> > index c0c33320fe65..ef96e4a02bbd 100644
> > --- a/drivers/nvme/host/tcp.c
> > +++ b/drivers/nvme/host/tcp.c
> > @@ -14,6 +14,7 @@
> >  #include <linux/blk-mq.h>
> >  #include <crypto/hash.h>
> >  #include <net/busy_poll.h>
> > +#include <net/tcp_ddp.h>
> >
> >  #include "nvme.h"
> >  #include "fabrics.h"
> > @@ -62,6 +63,7 @@ enum nvme_tcp_queue_flags {
> >       NVME_TCP_Q_ALLOCATED    = 0,
> >       NVME_TCP_Q_LIVE         = 1,
> >       NVME_TCP_Q_POLLING      = 2,
> > +     NVME_TCP_Q_OFFLOADS     = 3,
> >  };
> >
> > The same comment from the previous version - we are concerned that perhaps
> > the generic term "offload" for both the transport type (for the Marvell work)
> > and for the DDP and CRC offload queue (for the Mellanox work) may be
> > misleading and confusing to developers and to users.
> >
> > As suggested by Sagi, we can call this NVME_TCP_Q_DDP.
> >
>
> While I don't mind changing the naming here, I wonder why you don't call
> the TOE you use TOE rather than TCP_OFFLOAD, and then "offload" is free for this?

Thanks - please do change the name to NVME_TCP_Q_DDP.
The Marvell nvme-tcp-offload patch series introduces the offloading of both the
TCP and the NVMe/TCP layers, so it is not a TOE.

>
> Moreover, the most common use of offload in the kernel is for partial offloads
> like this one, and not for full offloads (such as toe).

Because each vendor might implement a different partial offload, I suggest
naming it after the specific technique that is used, as was suggested -
NVME_TCP_Q_DDP.
Boris Pismenny Dec. 17, 2020, 6:51 p.m. UTC | #4
On 15/12/2020 15:33, Shai Malin wrote:
> On 12/14/2020 08:38, Boris Pismenny wrote:
>> On 10/12/2020 19:15, Shai Malin wrote:
>>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>>> index c0c33320fe65..ef96e4a02bbd 100644
>>> --- a/drivers/nvme/host/tcp.c
>>> +++ b/drivers/nvme/host/tcp.c
>>> @@ -14,6 +14,7 @@
>>>  #include <linux/blk-mq.h>
>>>  #include <crypto/hash.h>
>>>  #include <net/busy_poll.h>
>>> +#include <net/tcp_ddp.h>
>>>
>>>  #include "nvme.h"
>>>  #include "fabrics.h"
>>> @@ -62,6 +63,7 @@ enum nvme_tcp_queue_flags {
>>>       NVME_TCP_Q_ALLOCATED    = 0,
>>>       NVME_TCP_Q_LIVE         = 1,
>>>       NVME_TCP_Q_POLLING      = 2,
>>> +     NVME_TCP_Q_OFFLOADS     = 3,
>>>  };
>>>
>>> The same comment from the previous version - we are concerned that perhaps
>>> the generic term "offload" for both the transport type (for the Marvell work)
>>> and for the DDP and CRC offload queue (for the Mellanox work) may be
>>> misleading and confusing to developers and to users.
>>>
>>> As suggested by Sagi, we can call this NVME_TCP_Q_DDP.
>>>
>>
>> While I don't mind changing the naming here, I wonder why you don't call
>> the TOE you use TOE rather than TCP_OFFLOAD, and then "offload" is free for this?
> 
> Thanks - please do change the name to NVME_TCP_Q_DDP.
> The Marvell nvme-tcp-offload patch series introduces the offloading of both the
> TCP and the NVMe/TCP layers, so it is not a TOE.
> 

Will do.

>>
>> Moreover, the most common use of offload in the kernel is for partial offloads
>> like this one, and not for full offloads (such as toe).
> 
> Because each vendor might implement a different partial offload, I suggest
> naming it after the specific technique that is used, as was suggested -
> NVME_TCP_Q_DDP.
> 


IIUC, if TCP/IP is offloaded entirely then it is called TOE. It doesn't matter
that you offload additional stuff (nvme-tcp) on top of that.

Patch

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index c0c33320fe65..ef96e4a02bbd 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -14,6 +14,7 @@ 
 #include <linux/blk-mq.h>
 #include <crypto/hash.h>
 #include <net/busy_poll.h>
+#include <net/tcp_ddp.h>
 
 #include "nvme.h"
 #include "fabrics.h"
@@ -62,6 +63,7 @@  enum nvme_tcp_queue_flags {
 	NVME_TCP_Q_ALLOCATED	= 0,
 	NVME_TCP_Q_LIVE		= 1,
 	NVME_TCP_Q_POLLING	= 2,
+	NVME_TCP_Q_OFFLOADS     = 3,
 };
 
 enum nvme_tcp_recv_state {
@@ -110,6 +112,8 @@  struct nvme_tcp_queue {
 	void (*state_change)(struct sock *);
 	void (*data_ready)(struct sock *);
 	void (*write_space)(struct sock *);
+
+	atomic64_t  resync_req;
 };
 
 struct nvme_tcp_ctrl {
@@ -128,6 +132,8 @@  struct nvme_tcp_ctrl {
 	struct delayed_work	connect_work;
 	struct nvme_tcp_request async_req;
 	u32			io_queues[HCTX_MAX_TYPES];
+
+	struct net_device       *offloading_netdev;
 };
 
 static LIST_HEAD(nvme_tcp_ctrl_list);
@@ -222,6 +228,180 @@  static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
 	return nvme_tcp_pdu_data_left(req) <= len;
 }
 
+#ifdef CONFIG_TCP_DDP
+
+bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags);
+const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops = {
+	.resync_request		= nvme_tcp_resync_request,
+};
+
+static
+int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue)
+{
+	struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true);
+	struct nvme_tcp_ddp_config config = {};
+	int ret;
+
+	if (!netdev) {
+		dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n");
+		return -ENODEV;
+	}
+
+	if (!(netdev->features & NETIF_F_HW_TCP_DDP)) {
+		dev_put(netdev);
+		return -EOPNOTSUPP;
+	}
+
+	config.cfg.type		= TCP_DDP_NVME;
+	config.pfv		= NVME_TCP_PFV_1_0;
+	config.cpda		= 0;
+	config.dgst		= queue->hdr_digest ?
+		NVME_TCP_HDR_DIGEST_ENABLE : 0;
+	config.dgst		|= queue->data_digest ?
+		NVME_TCP_DATA_DIGEST_ENABLE : 0;
+	config.queue_size	= queue->queue_size;
+	config.queue_id		= nvme_tcp_queue_id(queue);
+	config.io_cpu		= queue->io_cpu;
+
+	ret = netdev->tcp_ddp_ops->tcp_ddp_sk_add(netdev,
+						  queue->sock->sk,
+						  (struct tcp_ddp_config *)&config);
+	if (ret) {
+		dev_put(netdev);
+		return ret;
+	}
+
+	inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = &nvme_tcp_ddp_ulp_ops;
+	if (netdev->features & NETIF_F_HW_TCP_DDP)
+		set_bit(NVME_TCP_Q_OFFLOADS, &queue->flags);
+
+	return ret;
+}
+
+static
+void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
+{
+	struct net_device *netdev = queue->ctrl->offloading_netdev;
+
+	if (!netdev) {
+		dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n");
+		return;
+	}
+
+	netdev->tcp_ddp_ops->tcp_ddp_sk_del(netdev, queue->sock->sk);
+
+	inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = NULL;
+	dev_put(netdev); /* put the queue_init get_netdev_for_sock() */
+}
+
+static
+int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue)
+{
+	struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true);
+	struct tcp_ddp_limits limits;
+	int ret = 0;
+
+	if (!netdev) {
+		dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n");
+		return -ENODEV;
+	}
+
+	if (netdev->features & NETIF_F_HW_TCP_DDP &&
+	    netdev->tcp_ddp_ops &&
+	    netdev->tcp_ddp_ops->tcp_ddp_limits)
+		ret = netdev->tcp_ddp_ops->tcp_ddp_limits(netdev, &limits);
+	else
+		ret = -EOPNOTSUPP;
+
+	if (!ret) {
+		queue->ctrl->offloading_netdev = netdev;
+		dev_dbg_ratelimited(queue->ctrl->ctrl.device,
+				    "netdev %s offload limits: max_ddp_sgl_len %d\n",
+				    netdev->name, limits.max_ddp_sgl_len);
+		queue->ctrl->ctrl.max_segments = limits.max_ddp_sgl_len;
+		queue->ctrl->ctrl.max_hw_sectors =
+			limits.max_ddp_sgl_len << (ilog2(SZ_4K) - 9);
+	} else {
+		queue->ctrl->offloading_netdev = NULL;
+	}
+
+	dev_put(netdev);
+
+	return ret;
+}
+
+static
+void nvme_tcp_resync_response(struct nvme_tcp_queue *queue,
+			      unsigned int pdu_seq)
+{
+	struct net_device *netdev = queue->ctrl->offloading_netdev;
+	u64 resync_val;
+	u32 resync_seq;
+
+	resync_val = atomic64_read(&queue->resync_req);
+	/* Lower 32 bit flags. Check validity of the request */
+	if ((resync_val & TCP_DDP_RESYNC_REQ) == 0)
+		return;
+
+	/* Obtain and check requested sequence number: is this PDU header before the request? */
+	resync_seq = resync_val >> 32;
+	if (before(pdu_seq, resync_seq))
+		return;
+
+	if (unlikely(!netdev)) {
+		pr_info_ratelimited("%s: netdev not found\n", __func__);
+		return;
+	}
+
+	/**
+	 * The atomic operation guarantees that we don't miss any NIC driver
+	 * resync requests submitted after the above checks.
+	 */
+	if (atomic64_cmpxchg(&queue->resync_req, resync_val,
+			     resync_val & ~TCP_DDP_RESYNC_REQ))
+		netdev->tcp_ddp_ops->tcp_ddp_resync(netdev, queue->sock->sk, pdu_seq);
+}
+
+bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
+{
+	struct nvme_tcp_queue *queue = sk->sk_user_data;
+
+	atomic64_set(&queue->resync_req,
+		     (((uint64_t)seq << 32) | flags));
+
+	return true;
+}
+
+#else
+
+static
+int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue)
+{
+	return -EINVAL;
+}
+
+static
+void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
+{}
+
+static
+int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue)
+{
+	return -EINVAL;
+}
+
+static
+void nvme_tcp_resync_response(struct nvme_tcp_queue *queue,
+			      unsigned int pdu_seq)
+{}
+
+bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
+{
+	return false;
+}
+
+#endif
+
 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
 		unsigned int dir)
 {
@@ -627,6 +807,11 @@  static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
 	int ret;
 
+	u64 pdu_seq = TCP_SKB_CB(skb)->seq + *offset - queue->pdu_offset;
+
+	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags))
+		nvme_tcp_resync_response(queue, pdu_seq);
+
 	ret = skb_copy_bits(skb, *offset,
 		&pdu[queue->pdu_offset], rcv_len);
 	if (unlikely(ret))
@@ -1517,6 +1702,9 @@  static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
 	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
 	nvme_tcp_restore_sock_calls(queue);
 	cancel_work_sync(&queue->io_work);
+
+	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags))
+		nvme_tcp_unoffload_socket(queue);
 }
 
 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
@@ -1534,10 +1722,13 @@  static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
 	int ret;
 
-	if (idx)
+	if (idx) {
 		ret = nvmf_connect_io_queue(nctrl, idx, false);
-	else
+		nvme_tcp_offload_socket(&ctrl->queues[idx]);
+	} else {
 		ret = nvmf_connect_admin_queue(nctrl);
+		nvme_tcp_offload_limits(&ctrl->queues[idx]);
+	}
 
 	if (!ret) {
 		set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
@@ -1640,6 +1831,8 @@  static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
 {
 	int ret;
 
+	to_tcp_ctrl(ctrl)->offloading_netdev = NULL;
+
 	ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
 	if (ret)
 		return ret;