Message ID | 20231214132623.119227-6-aaptel@nvidia.com (mailing list archive)
---|---
State | Superseded
Series | nvme-tcp receive offloads
On 14/12/2023 15:26, Aurelien Aptel wrote:
> From: Boris Pismenny <borisp@nvidia.com>
>
> This commit introduces direct data placement offload to NVMe-TCP. There
> is a context per queue, which is established after the handshake using
> the sk_add/del NDOs.
>
> Additionally, a resynchronization routine is used to assist hardware
> recovery from TCP OOO, and continue the offload. Resynchronization
> operates as follows:
>
> 1. TCP OOO causes the NIC HW to stop the offload.
>
> 2. NIC HW identifies a PDU header at some TCP sequence number, and asks
>    NVMe-TCP to confirm it. This request is delivered from the NIC
>    driver to NVMe-TCP by first finding the socket for the packet that
>    triggered the request, and then finding the nvme_tcp_queue that is
>    used by this routine. Finally, the request is recorded in the
>    nvme_tcp_queue.
>
> 3. When NVMe-TCP observes the requested TCP sequence, it will compare
>    it with the PDU header TCP sequence, and report the result to the
>    NIC driver (resync), which will update the HW, and resume offload
>    when all is successful.
>
> Some HW implementations, such as ConnectX-7, assume linear CCIDs
> (0...N-1 for a queue of size N), whereas the Linux nvme driver uses
> part of the 16-bit CCID as a generation counter. To address that, we
> use the existing quirk in the nvme layer when the HW driver advertises
> that the device does not support the full 16-bit CCID range.
>
> Furthermore, we let the offloading driver advertise the max hw
> sectors/segments via ulp_ddp_limits.
>
> A follow-up patch introduces the data-path changes required for this
> offload.
>
> Socket operations need a netdev reference. This reference is dropped
> on NETDEV_GOING_DOWN events to allow the device to go down in a
> follow-up patch.
>
> Signed-off-by: Boris Pismenny <borisp@nvidia.com>
> Signed-off-by: Ben Ben-Ishay <benishay@nvidia.com>
> Signed-off-by: Or Gerlitz <ogerlitz@nvidia.com>
> Signed-off-by: Yoray Zack <yorayz@nvidia.com>
> Signed-off-by: Shai Malin <smalin@nvidia.com>
> Signed-off-by: Aurelien Aptel <aaptel@nvidia.com>
> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
> Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
> ---
>  drivers/nvme/host/tcp.c | 264 ++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 251 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index d79811cfa0ce..52b129401c78 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -21,6 +21,10 @@
>  #include <net/busy_poll.h>
>  #include <trace/events/sock.h>
>
> +#ifdef CONFIG_ULP_DDP
> +#include <net/ulp_ddp.h>
> +#endif
> +
>  #include "nvme.h"
>  #include "fabrics.h"
>
> @@ -46,6 +50,16 @@ MODULE_PARM_DESC(tls_handshake_timeout,
>  		 "nvme TLS handshake timeout in seconds (default 10)");
>  #endif
>
> +#ifdef CONFIG_ULP_DDP
> +/* NVMeTCP direct data placement and data digest offload will not
> + * happen if this parameter is false (default), regardless of what the
> + * underlying netdev capabilities are.
> + */
> +static bool ddp_offload;
> +module_param(ddp_offload, bool, 0644);
> +MODULE_PARM_DESC(ddp_offload, "Enable or disable NVMeTCP direct data placement support");
> +#endif
> +
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>  /* lockdep can detect a circular dependency of the form
>   *   sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
> @@ -119,6 +133,7 @@ enum nvme_tcp_queue_flags {
>  	NVME_TCP_Q_ALLOCATED	= 0,
>  	NVME_TCP_Q_LIVE		= 1,
>  	NVME_TCP_Q_POLLING	= 2,
> +	NVME_TCP_Q_OFF_DDP	= 3,
>  };
>
>  enum nvme_tcp_recv_state {
> @@ -146,6 +161,18 @@ struct nvme_tcp_queue {
>  	size_t			ddgst_remaining;
>  	unsigned int		nr_cqe;
>
> +#ifdef CONFIG_ULP_DDP
> +	/*
> +	 * resync_tcp_seq is a speculative PDU header tcp seq number (with
> +	 * an additional flag in the lower 32 bits) that the HW sends to
> +	 * the SW, for the SW to verify.
> +	 * - The 32 high bits store the seq number
> +	 * - The 32 low bits are used as a flag to know if a request
> +	 *   is pending (ULP_DDP_RESYNC_PENDING).
> +	 */
> +	atomic64_t		resync_tcp_seq;
> +#endif
> +
>  	/* send state */
>  	struct nvme_tcp_request *request;
>
> @@ -186,6 +213,12 @@ struct nvme_tcp_ctrl {
>  	struct delayed_work	connect_work;
>  	struct nvme_tcp_request	async_req;
>  	u32			io_queues[HCTX_MAX_TYPES];
> +
> +	struct net_device	*ddp_netdev;
> +	u32			ddp_threshold;
> +#ifdef CONFIG_ULP_DDP
> +	struct ulp_ddp_limits	ddp_limits;
> +#endif
>  };
>
>  static LIST_HEAD(nvme_tcp_ctrl_list);
> @@ -297,6 +330,171 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
>  	return nvme_tcp_pdu_data_left(req) <= len;
>  }
>
> +#ifdef CONFIG_ULP_DDP
> +
> +static struct net_device *
> +nvme_tcp_get_ddp_netdev_with_limits(struct nvme_tcp_ctrl *ctrl)
> +{
> +	struct net_device *netdev;
> +	int ret;
> +
> +	if (!ddp_offload)
> +		return NULL;
> +
> +	/* netdev ref is put in nvme_tcp_stop_admin_queue() */
> +	netdev = get_netdev_for_sock(ctrl->queues[0].sock->sk);
> +	if (!netdev) {
> +		dev_dbg(ctrl->ctrl.device, "netdev not found\n");
> +		return NULL;
> +	}
> +
> +	if (!ulp_ddp_is_cap_active(netdev, ULP_DDP_CAP_NVME_TCP))
> +		goto err;
> +
> +	ret = ulp_ddp_get_limits(netdev, &ctrl->ddp_limits, ULP_DDP_NVME);
> +	if (ret)
> +		goto err;
> +
> +	if (ctrl->ctrl.opts->tls && !ctrl->ddp_limits.tls)
> +		goto err;
> +
> +	return netdev;
> +err:
> +	dev_put(netdev);
> +	return NULL;
> +}
> +
> +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags);
> +static const struct ulp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops = {
> +	.resync_request		= nvme_tcp_resync_request,
> +};
> +
> +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue)
> +{
> +	struct ulp_ddp_config config = {.type = ULP_DDP_NVME};
> +	int ret;
> +
> +	config.nvmeotcp.pfv = NVME_TCP_PFV_1_0;
> +	config.nvmeotcp.cpda = 0;
> +	config.nvmeotcp.dgst =
> +		queue->hdr_digest ? NVME_TCP_HDR_DIGEST_ENABLE : 0;
> +	config.nvmeotcp.dgst |=
> +		queue->data_digest ? NVME_TCP_DATA_DIGEST_ENABLE : 0;
> +	config.nvmeotcp.queue_size = queue->ctrl->ctrl.sqsize + 1;
> +	config.nvmeotcp.queue_id = nvme_tcp_queue_id(queue);
> +
> +	ret = ulp_ddp_sk_add(queue->ctrl->ddp_netdev,
> +			     queue->sock->sk,
> +			     &config,
> +			     &nvme_tcp_ddp_ulp_ops);
> +	if (ret)
> +		return ret;
> +
> +	set_bit(NVME_TCP_Q_OFF_DDP, &queue->flags);
> +
> +	return 0;
> +}
> +
> +static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
> +{
> +	clear_bit(NVME_TCP_Q_OFF_DDP, &queue->flags);
> +	ulp_ddp_sk_del(queue->ctrl->ddp_netdev, queue->sock->sk);
> +}
> +
> +static void nvme_tcp_ddp_apply_limits(struct nvme_tcp_ctrl *ctrl)
> +{
> +	ctrl->ctrl.max_segments = ctrl->ddp_limits.max_ddp_sgl_len;
> +	ctrl->ctrl.max_hw_sectors =
> +		ctrl->ddp_limits.max_ddp_sgl_len << (ilog2(SZ_4K) - SECTOR_SHIFT);
> +	ctrl->ddp_threshold = ctrl->ddp_limits.io_threshold;
> +
> +	/* offloading HW doesn't support full ccid range, apply the quirk */
> +	ctrl->ctrl.quirks |=
> +		ctrl->ddp_limits.nvmeotcp.full_ccid_range ? 0 : NVME_QUIRK_SKIP_CID_GEN;
> +}
> +
> +/* In presence of packet drops or network packet reordering, the device may lose
> + * synchronization between the TCP stream and the L5P framing, and require a
> + * resync with the kernel's TCP stack.
> + *
> + * - NIC HW identifies a PDU header at some TCP sequence number,
> + *   and asks NVMe-TCP to confirm it.
> + * - When NVMe-TCP observes the requested TCP sequence, it will compare
> + *   it with the PDU header TCP sequence, and report the result to the
> + *   NIC driver
> + */
> +static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue,
> +				     struct sk_buff *skb, unsigned int offset)
> +{
> +	u64 pdu_seq = TCP_SKB_CB(skb)->seq + offset - queue->pdu_offset;
> +	struct net_device *netdev = queue->ctrl->ddp_netdev;
> +	u64 pdu_val = (pdu_seq << 32) | ULP_DDP_RESYNC_PENDING;
> +	u64 resync_val;
> +	u32 resync_seq;
> +
> +	resync_val = atomic64_read(&queue->resync_tcp_seq);
> +	/* Lower 32 bit flags. Check validity of the request */
> +	if ((resync_val & ULP_DDP_RESYNC_PENDING) == 0)
> +		return;
> +
> +	/*
> +	 * Obtain and check requested sequence number: is this PDU header
> +	 * before the request?
> +	 */
> +	resync_seq = resync_val >> 32;
> +	if (before(pdu_seq, resync_seq))
> +		return;
> +
> +	/*
> +	 * The atomic operation guarantees that we don't miss any NIC driver
> +	 * resync requests submitted after the above checks.
> +	 */
> +	if (atomic64_cmpxchg(&queue->resync_tcp_seq, pdu_val,
> +			     pdu_val & ~ULP_DDP_RESYNC_PENDING) !=
> +	    atomic64_read(&queue->resync_tcp_seq))
> +		ulp_ddp_resync(netdev, queue->sock->sk, pdu_seq);
> +}
> +
> +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
> +{
> +	struct nvme_tcp_queue *queue = sk->sk_user_data;
> +
> +	/*
> +	 * "seq" (TCP seq number) is what the HW assumes is the
> +	 * beginning of a PDU. The nvme-tcp layer needs to store the
> +	 * number along with the "flags" (ULP_DDP_RESYNC_PENDING) to
> +	 * indicate that a request is pending.
> +	 */
> +	atomic64_set(&queue->resync_tcp_seq, (((uint64_t)seq << 32) | flags));
> +
> +	return true;
> +}
> +
> +#else
> +
> +static struct net_device *
> +nvme_tcp_get_ddp_netdev_with_limits(struct nvme_tcp_ctrl *ctrl)
> +{
> +	return NULL;
> +}
> +
> +static void nvme_tcp_ddp_apply_limits(struct nvme_tcp_ctrl *ctrl)
> +{}
> +
> +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue)
> +{
> +	return 0;
> +}
> +
> +static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
> +{}
> +
> +static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue,
> +				     struct sk_buff *skb, unsigned int offset)
> +{}
> +
> +#endif
> +
>  static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
>  			       unsigned int dir)
>  {
> @@ -739,6 +937,9 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>  	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
>  	int ret;
>
> +	if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags))
> +		nvme_tcp_resync_response(queue, skb, *offset);

lets try to optimize the fast path with:

if (IS_ENABLED(CONFIG_ULP_DDP) && test_bit(NVME_TCP_Q_OFF_DDP,
    &queue->flags))
	nvme_tcp_resync_response(queue, skb, *offset);

> +
>  	ret = skb_copy_bits(skb, *offset,
>  			    &pdu[queue->pdu_offset], rcv_len);
>  	if (unlikely(ret))
> @@ -1804,6 +2005,8 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
>  	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
>  	nvme_tcp_restore_sock_ops(queue);
>  	cancel_work_sync(&queue->io_work);
> +	if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags))
> +		nvme_tcp_unoffload_socket(queue);
>  }
>
>  static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> @@ -1820,6 +2023,20 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
>  	mutex_unlock(&queue->queue_lock);
>  }
>
> +static void nvme_tcp_stop_admin_queue(struct nvme_ctrl *nctrl)
> +{
> +	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
> +
> +	nvme_tcp_stop_queue(nctrl, 0);
> +
> +	/*
> +	 * We are called twice by nvme_tcp_teardown_admin_queue()
> +	 * Set ddp_netdev to NULL to avoid putting it twice
> +	 */
> +	dev_put(ctrl->ddp_netdev);
> +	ctrl->ddp_netdev = NULL;
> +}
> +
>  static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
>  {
>  	write_lock_bh(&queue->sock->sk->sk_callback_lock);
> @@ -1846,19 +2063,37 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
>  	nvme_tcp_init_recv_ctx(queue);
>  	nvme_tcp_setup_sock_ops(queue);
>
> -	if (idx)
> +	if (idx) {
>  		ret = nvmf_connect_io_queue(nctrl, idx);
> -	else
> +		if (ret)
> +			goto err;
> +
> +		if (ctrl->ddp_netdev) {
> +			ret = nvme_tcp_offload_socket(queue);
> +			if (ret) {
> +				dev_info(nctrl->device,
> +					 "failed to setup offload on queue %d ret=%d\n",
> +					 idx, ret);
> +			}
> +		}
> +	} else {
>  		ret = nvmf_connect_admin_queue(nctrl);
> +		if (ret)
> +			goto err;
> +
> +		ctrl->ddp_netdev = nvme_tcp_get_ddp_netdev_with_limits(ctrl);
> +		if (ctrl->ddp_netdev)
> +			nvme_tcp_ddp_apply_limits(ctrl);
>
> -	if (!ret) {
> -		set_bit(NVME_TCP_Q_LIVE, &queue->flags);
> -	} else {
> -		if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
> -			__nvme_tcp_stop_queue(queue);
> -		dev_err(nctrl->device,
> -			"failed to connect queue: %d ret=%d\n", idx, ret);
>  	}
> +
> +	set_bit(NVME_TCP_Q_LIVE, &queue->flags);
> +	return 0;
> +err:
> +	if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
> +		__nvme_tcp_stop_queue(queue);
> +	dev_err(nctrl->device,
> +		"failed to connect queue: %d ret=%d\n", idx, ret);
>  	return ret;
>  }
>
> @@ -2070,7 +2305,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
>
>  static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
>  {
> -	nvme_tcp_stop_queue(ctrl, 0);
> +	nvme_tcp_stop_admin_queue(ctrl);
>  	if (remove)
>  		nvme_remove_admin_tag_set(ctrl);
>  	nvme_tcp_free_admin_queue(ctrl);
> @@ -2113,7 +2348,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
>  	nvme_quiesce_admin_queue(ctrl);
>  	blk_sync_queue(ctrl->admin_q);
>  out_stop_queue:
> -	nvme_tcp_stop_queue(ctrl, 0);
> +	nvme_tcp_stop_admin_queue(ctrl);
>  	nvme_cancel_admin_tagset(ctrl);
>  out_cleanup_tagset:
>  	if (new)
> @@ -2128,7 +2363,7 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
>  {
>  	nvme_quiesce_admin_queue(ctrl);
>  	blk_sync_queue(ctrl->admin_q);
> -	nvme_tcp_stop_queue(ctrl, 0);
> +	nvme_tcp_stop_admin_queue(ctrl);
>  	nvme_cancel_admin_tagset(ctrl);
>  	if (remove)
>  		nvme_unquiesce_admin_queue(ctrl);
> @@ -2413,7 +2648,10 @@ static void nvme_tcp_complete_timed_out(struct request *rq)
>  	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
>  	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
>
> -	nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
> +	if (nvme_tcp_admin_queue(req->queue))
> +		nvme_tcp_stop_admin_queue(ctrl);
> +	else
> +		nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
>  	nvmf_complete_timed_out_request(rq);
>  }
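A note on the resync mechanism quoted above: the whole handshake hinges on packing a 32-bit TCP sequence number and a pending flag into one 64-bit atomic, so the receive path can confirm or ignore a request without taking a lock. Below is a minimal, self-contained userspace sketch of that packing; it uses C11 atomics in place of the kernel's atomic64_t, re-implements before() for illustration, and its helper names only mirror the patch rather than copy its code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RESYNC_PENDING 1ULL	/* stand-in for ULP_DDP_RESYNC_PENDING */

/* Wrap-safe "is TCP seq a earlier than b?" check. */
static bool before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

/* NIC driver side: record the sequence number the HW wants confirmed. */
static void resync_request(_Atomic uint64_t *state, uint32_t seq)
{
	atomic_store(state, ((uint64_t)seq << 32) | RESYNC_PENDING);
}

/* ULP side: called with the TCP seq of each PDU header that is parsed. */
static bool resync_response(_Atomic uint64_t *state, uint32_t pdu_seq)
{
	uint64_t val = atomic_load(state);

	if (!(val & RESYNC_PENDING))
		return false;		/* nothing to confirm */
	if (before(pdu_seq, (uint32_t)(val >> 32)))
		return false;		/* request lies further ahead in the stream */

	/* Clear the pending bit only if no newer request raced in. */
	return atomic_compare_exchange_strong(state, &val,
					      val & ~RESYNC_PENDING);
}

int main(void)
{
	_Atomic uint64_t state = 0;

	resync_request(&state, 1000);
	printf("PDU at seq 900 confirms:  %d\n", resync_response(&state, 900));
	printf("PDU at seq 1000 confirms: %d\n", resync_response(&state, 1000));
	return 0;
}

In the sketch, the compare-and-exchange plays the role the patch's comment describes: if the driver stores a newer request between the load and the exchange, the exchange fails, nothing is reported, and the confirmation is simply retried on a later PDU header.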
Max Gurtovoy <mgurtovoy@nvidia.com> writes:
>> @@ -739,6 +937,9 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>>  	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
>>  	int ret;
>>
>> +	if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags))
>> +		nvme_tcp_resync_response(queue, skb, *offset);
>
> lets try to optimize the fast path with:
>
> if (IS_ENABLED(CONFIG_ULP_DDP) && test_bit(NVME_TCP_Q_OFF_DDP,
>     &queue->flags))
> 	nvme_tcp_resync_response(queue, skb, *offset);
>

For this one, when ULP_DDP is disabled, I do see 1 extra mov instruction
but no branching... I think it's negligible personally.

$ gdb drivers/nvme/host/nvme-tcp.ko
(gdb) disass /s nvme_tcp_recv_skb
...
1088	static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
1089			unsigned int *offset, size_t *len)
1090	{
1091		struct nvme_tcp_hdr *hdr;
1092		char *pdu = queue->pdu;
   0x00000000000046a6 <+118>:	mov    %rsi,-0x70(%rbp)

880		return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
   0x00000000000046aa <+122>:	test   %ebx,%ebx
   0x00000000000046ac <+124>:	je     0x4975 <nvme_tcp_recv_skb+837>

1093		size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
   0x00000000000046b2 <+130>:	cmp    %r14,%rbx

1100				    &pdu[queue->pdu_offset], rcv_len);
   0x00000000000046b5 <+133>:	movslq 0x19c(%r12),%rdx

1099		ret = skb_copy_bits(skb, *offset,
   0x00000000000046bd <+141>:	mov    -0x58(%rbp),%rdi

1093		size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
   0x00000000000046c1 <+145>:	cmova  %r14,%rbx

./arch/x86/include/asm/bitops.h:
205		return ((1UL << (nr & (BITS_PER_LONG-1)))
   0x00000000000046c5 <+149>:	mov    0x1d8(%r12),%rax

Extra mov of queue->flags offset here ^^^^^^^^

(gdb) p &((struct nvme_tcp_queue *)0)->flags
$1 = (unsigned long *) 0x1d8
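The optimization Max suggests relies on IS_ENABLED(CONFIG_FOO) expanding to a compile-time constant 1 or 0, so when the option is compiled out the whole condition folds to false and the flags load goes away. Here is a standalone sketch of the same idea; the macro and the `queue_flags & 1` test are stand-ins for the kernel's IS_ENABLED() and test_bit(), not the real implementations.

#include <stdio.h>

/* Stand-in for the kernel's IS_ENABLED(): resolves to 0 or 1 at compile time. */
#ifdef CONFIG_ULP_DDP
#define ULP_DDP_ENABLED 1
#else
#define ULP_DDP_ENABLED 0
#endif

static unsigned long queue_flags;	/* stands in for queue->flags */

static void resync_response(void)
{
	puts("resync response checked");
}

static void recv_pdu(void)
{
	/*
	 * With CONFIG_ULP_DDP undefined this condition is "0 && ...", so
	 * the compiler drops both the flags load (the extra mov measured
	 * above) and the call; with it defined, only the runtime bit test
	 * remains.
	 */
	if (ULP_DDP_ENABLED && (queue_flags & 1))
		resync_response();
}

int main(void)
{
	recv_pdu();
	return 0;
}

Compiling this with and without -DCONFIG_ULP_DDP and comparing the disassembly reproduces, minus the kernel specifics, the one-instruction difference measured above.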
On 18/12/2023 22:00, Aurelien Aptel wrote:
> Max Gurtovoy <mgurtovoy@nvidia.com> writes:
>>> @@ -739,6 +937,9 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>>>  	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
>>>  	int ret;
>>>
>>> +	if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags))
>>> +		nvme_tcp_resync_response(queue, skb, *offset);
>>
>> lets try to optimize the fast path with:
>>
>> if (IS_ENABLED(CONFIG_ULP_DDP) && test_bit(NVME_TCP_Q_OFF_DDP,
>>     &queue->flags))
>> 	nvme_tcp_resync_response(queue, skb, *offset);
>>
>
> For this one, when ULP_DDP is disabled, I do see 1 extra mov instruction
> but no branching... I think it's negligible personally.
>
> $ gdb drivers/nvme/host/nvme-tcp.ko
> (gdb) disass /s nvme_tcp_recv_skb
> ...
> 1088	static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
> 1089			unsigned int *offset, size_t *len)
> 1090	{
> 1091		struct nvme_tcp_hdr *hdr;
> 1092		char *pdu = queue->pdu;
>    0x00000000000046a6 <+118>:	mov    %rsi,-0x70(%rbp)
>
> 880		return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
>    0x00000000000046aa <+122>:	test   %ebx,%ebx
>    0x00000000000046ac <+124>:	je     0x4975 <nvme_tcp_recv_skb+837>
>
> 1093		size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
>    0x00000000000046b2 <+130>:	cmp    %r14,%rbx
>
> 1100				    &pdu[queue->pdu_offset], rcv_len);
>    0x00000000000046b5 <+133>:	movslq 0x19c(%r12),%rdx
>
> 1099		ret = skb_copy_bits(skb, *offset,
>    0x00000000000046bd <+141>:	mov    -0x58(%rbp),%rdi
>
> 1093		size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
>    0x00000000000046c1 <+145>:	cmova  %r14,%rbx
>
> ./arch/x86/include/asm/bitops.h:
> 205		return ((1UL << (nr & (BITS_PER_LONG-1)))
>    0x00000000000046c5 <+149>:	mov    0x1d8(%r12),%rax
>
> Extra mov of queue->flags offset here ^^^^^^^^
>
> (gdb) p &((struct nvme_tcp_queue *)0)->flags
> $1 = (unsigned long *) 0x1d8

Ok, we can keep it as is.

Sagi, any comments on the NVMf patches or on the others before we send the
next version? We would like to make it into the 6.8 merge window.
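As a usage note for anyone trying the series (an editorial aside, not part of the thread): the offload stays disabled unless the new ddp_offload module parameter is turned on, for example at load time with ddp_offload=1 or, since the parameter is created with 0644 permissions, through /sys/module/nvme_tcp/parameters/ddp_offload; only then does nvme_tcp_get_ddp_netdev_with_limits() look for a capable netdev. Once the limits are applied, max_hw_sectors follows from max_ddp_sgl_len shifted by ilog2(SZ_4K) - SECTOR_SHIFT = 12 - 9 = 3, i.e. eight 512-byte sectors per 4K-sized SGL entry.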