Message ID | 20210211211044.32701-7-borisp@mellanox.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | nvme-tcp receive offloads | expand |
On 2/11/21 2:10 PM, Boris Pismenny wrote: > @@ -223,6 +229,164 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, > return nvme_tcp_pdu_data_left(req) <= len; > } > > +#ifdef CONFIG_TCP_DDP > + > +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags); > +static const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops = { > + .resync_request = nvme_tcp_resync_request, > +}; > + > +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) > +{ > + struct net_device *netdev = queue->ctrl->offloading_netdev; > + struct nvme_tcp_ddp_config config = {}; > + int ret; > + > + if (!(netdev->features & NETIF_F_HW_TCP_DDP)) If nvme_tcp_offload_limits does not find a dst_entry on the socket then offloading_netdev may not NULL at this point. > + return -EOPNOTSUPP; > + > + config.cfg.type = TCP_DDP_NVME; > + config.pfv = NVME_TCP_PFV_1_0; > + config.cpda = 0; > + config.dgst = queue->hdr_digest ? > + NVME_TCP_HDR_DIGEST_ENABLE : 0; > + config.dgst |= queue->data_digest ? > + NVME_TCP_DATA_DIGEST_ENABLE : 0; > + config.queue_size = queue->queue_size; > + config.queue_id = nvme_tcp_queue_id(queue); > + config.io_cpu = queue->io_cpu; > + > + dev_hold(netdev); /* put by unoffload_socket */ > + ret = netdev->tcp_ddp_ops->tcp_ddp_sk_add(netdev, > + queue->sock->sk, > + &config.cfg); > + if (ret) { > + dev_put(netdev); > + return ret; > + } > + > + inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = &nvme_tcp_ddp_ulp_ops; > + if (netdev->features & NETIF_F_HW_TCP_DDP) > + set_bit(NVME_TCP_Q_OFF_DDP, &queue->flags); > + > + return ret; > +} > + > +static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue) > +{ > + struct net_device *netdev = queue->ctrl->offloading_netdev; > + > + if (!netdev) { > + dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n"); you are already logged in nvme_tcp_offload_limits that get_netdev_for_sock returned NULL; no need to do it again. 
> + return; > + } > + > + netdev->tcp_ddp_ops->tcp_ddp_sk_del(netdev, queue->sock->sk); > + > + inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = NULL; > + dev_put(netdev); /* held by offload_socket */ > +} > + > +static int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue) > +{ > + struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true); > + struct tcp_ddp_limits limits; > + int ret = 0; > + > + if (!netdev) { > + dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n"); This should be more informative. > + queue->ctrl->offloading_netdev = NULL; > + return -ENODEV; > + } > + > + if (netdev->features & NETIF_F_HW_TCP_DDP && > + netdev->tcp_ddp_ops && > + netdev->tcp_ddp_ops->tcp_ddp_limits) > + ret = netdev->tcp_ddp_ops->tcp_ddp_limits(netdev, &limits); > + else > + ret = -EOPNOTSUPP; > + > + if (!ret) { > + queue->ctrl->offloading_netdev = netdev; you save a reference to the netdev here, but then release the refcnt below. That device could be deleted between this point in time and the initialization of all queues. > + dev_dbg_ratelimited(queue->ctrl->ctrl.device, > + "netdev %s offload limits: max_ddp_sgl_len %d\n", > + netdev->name, limits.max_ddp_sgl_len); > + queue->ctrl->ctrl.max_segments = limits.max_ddp_sgl_len; > + queue->ctrl->ctrl.max_hw_sectors = > + limits.max_ddp_sgl_len << (ilog2(SZ_4K) - 9); > + } else { > + queue->ctrl->offloading_netdev = NULL; > + } > + > + /* release the device as no offload context is established yet. */ > + dev_put(netdev); > + > + return ret; > +} > + > +static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, > + struct sk_buff *skb, unsigned int offset) > +{ > + u64 pdu_seq = TCP_SKB_CB(skb)->seq + offset - queue->pdu_offset; > + struct net_device *netdev = queue->ctrl->offloading_netdev; > + u64 resync_val; > + u32 resync_seq; > + > + resync_val = atomic64_read(&queue->resync_req); > + /* Lower 32 bit flags. 
Check validity of the request */ > + if ((resync_val & TCP_DDP_RESYNC_REQ) == 0) > + return; > + > + /* Obtain and check requested sequence number: is this PDU header before the request? */ > + resync_seq = resync_val >> 32; > + if (before(pdu_seq, resync_seq)) > + return; > + > + if (unlikely(!netdev)) { > + pr_info_ratelimited("%s: netdev not found\n", __func__); can't happen right? you get here because NVME_TCP_Q_OFF_DDP is set and it is only set if offloading_netdev is set and the device supports offload. > + return; > + } > + > + /** > + * The atomic operation gurarantees that we don't miss any NIC driver > + * resync requests submitted after the above checks. > + */ > + if (atomic64_cmpxchg(&queue->resync_req, resync_val, > + resync_val & ~TCP_DDP_RESYNC_REQ)) > + netdev->tcp_ddp_ops->tcp_ddp_resync(netdev, queue->sock->sk, pdu_seq); > +} > +
On Sun, Feb 14, 2021 at 8:20 PM David Ahern <dsahern@gmail.com> wrote: > On 2/11/21 2:10 PM, Boris Pismenny wrote: > > @@ -223,6 +229,164 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, > > return nvme_tcp_pdu_data_left(req) <= len; > > } Hi Dave, Thanks for the continuous feedback. Folks are out this week and it seems that for a few of the comments we will need to discuss internally, but anyway I will address at least some of the comments later today/tomorrow. Or.
On Sun, Feb 14, 2021 at 8:20 PM David Ahern <dsahern@gmail.com> wrote: > On 2/11/21 2:10 PM, Boris Pismenny wrote: > > @@ -223,6 +229,164 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, > > return nvme_tcp_pdu_data_left(req) <= len; > > } > > > > +#ifdef CONFIG_TCP_DDP > > + > > +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags); > > +static const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops = { > > + .resync_request = nvme_tcp_resync_request, > > +}; > > + > > +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) > > +{ > > + struct net_device *netdev = queue->ctrl->offloading_netdev; > > + struct nvme_tcp_ddp_config config = {}; > > + int ret; > > + > > + if (!(netdev->features & NETIF_F_HW_TCP_DDP)) > > If nvme_tcp_offload_limits does not find a dst_entry on the socket then > offloading_netdev may not NULL at this point. correct :( will look on that > > > + return -EOPNOTSUPP; > > + > > + config.cfg.type = TCP_DDP_NVME; > > + config.pfv = NVME_TCP_PFV_1_0; > > + config.cpda = 0; > > + config.dgst = queue->hdr_digest ? > > + NVME_TCP_HDR_DIGEST_ENABLE : 0; > > + config.dgst |= queue->data_digest ? 
> > + NVME_TCP_DATA_DIGEST_ENABLE : 0; > > + config.queue_size = queue->queue_size; > > + config.queue_id = nvme_tcp_queue_id(queue); > > + config.io_cpu = queue->io_cpu; > > + > > + dev_hold(netdev); /* put by unoffload_socket */ > > + ret = netdev->tcp_ddp_ops->tcp_ddp_sk_add(netdev, > > + queue->sock->sk, > > + &config.cfg); > > + if (ret) { > > + dev_put(netdev); > > + return ret; > > + } > > + > > + inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = &nvme_tcp_ddp_ulp_ops; > > + if (netdev->features & NETIF_F_HW_TCP_DDP) > > + set_bit(NVME_TCP_Q_OFF_DDP, &queue->flags); > > + > > + return ret; > > +} > > + > > +static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue) > > +{ > > + struct net_device *netdev = queue->ctrl->offloading_netdev; > > + > > + if (!netdev) { > > + dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n"); > > you are already logged in nvme_tcp_offload_limits that > get_netdev_for_sock returned NULL; no need to do it again. yeah, re this one and the few other places where you commented on the same or similar thing, I tend to agree we need to go on the kernel trusted programming paradigm and avoid over checking, will discuss that with the team. > > + return; > > + } > > + > > + netdev->tcp_ddp_ops->tcp_ddp_sk_del(netdev, queue->sock->sk); > > + > > + inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = NULL; > > + dev_put(netdev); /* held by offload_socket */ > > +} > > + > > +static int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue) > > +{ > > + struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true); > > + struct tcp_ddp_limits limits; > > + int ret = 0; > > + > > + if (!netdev) { > > + dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n"); > > This should be more informative. 
okk > > + queue->ctrl->offloading_netdev = NULL; > > + return -ENODEV; > > + } > > + > > + if (netdev->features & NETIF_F_HW_TCP_DDP && > > + netdev->tcp_ddp_ops && > > + netdev->tcp_ddp_ops->tcp_ddp_limits) > > + ret = netdev->tcp_ddp_ops->tcp_ddp_limits(netdev, &limits); > > + else > > + ret = -EOPNOTSUPP; > > + > > + if (!ret) { > > + queue->ctrl->offloading_netdev = netdev; > > you save a reference to the netdev here, but then release the refcnt > below. That device could be deleted between this point in time and the > initialization of all queues. > > + dev_dbg_ratelimited(queue->ctrl->ctrl.device, > > + "netdev %s offload limits: max_ddp_sgl_len %d\n", > > + netdev->name, limits.max_ddp_sgl_len); > > + queue->ctrl->ctrl.max_segments = limits.max_ddp_sgl_len; > > + queue->ctrl->ctrl.max_hw_sectors = > > + limits.max_ddp_sgl_len << (ilog2(SZ_4K) - 9); > > + } else { > > + queue->ctrl->offloading_netdev = NULL; > > + } > > + > > + /* release the device as no offload context is established yet. */ > > + dev_put(netdev); > > + > > + return ret; > > +} > > + > > +static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, > > + struct sk_buff *skb, unsigned int offset) > > +{ > > + u64 pdu_seq = TCP_SKB_CB(skb)->seq + offset - queue->pdu_offset; > > + struct net_device *netdev = queue->ctrl->offloading_netdev; > > + u64 resync_val; > > + u32 resync_seq; > > + > > + resync_val = atomic64_read(&queue->resync_req); > > + /* Lower 32 bit flags. Check validity of the request */ > > + if ((resync_val & TCP_DDP_RESYNC_REQ) == 0) > > + return; > > + > > + /* Obtain and check requested sequence number: is this PDU header before the request? */ > > + resync_seq = resync_val >> 32; > > + if (before(pdu_seq, resync_seq)) > > + return; > > + > > + if (unlikely(!netdev)) { > > + pr_info_ratelimited("%s: netdev not found\n", __func__); > > can't happen right? 
you get here because NVME_TCP_Q_OFF_DDP is set and > it is only set if offloading_netdev is set and the device supports offload. As I wrote above, we will discuss this general comment and will likely go in the direction you are pointing.
On 17/02/2021 15:55, Or Gerlitz wrote: > On Sun, Feb 14, 2021 at 8:20 PM David Ahern <dsahern@gmail.com> wrote: >> On 2/11/21 2:10 PM, Boris Pismenny wrote: >>> @@ -223,6 +229,164 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, >>> return nvme_tcp_pdu_data_left(req) <= len; >>> } >>> >>> +#ifdef CONFIG_TCP_DDP >>> + >>> +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags); >>> +static const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops = { >>> + .resync_request = nvme_tcp_resync_request, >>> +}; >>> + >>> +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) >>> +{ >>> + struct net_device *netdev = queue->ctrl->offloading_netdev; >>> + struct nvme_tcp_ddp_config config = {}; >>> + int ret; >>> + >>> + if (!(netdev->features & NETIF_F_HW_TCP_DDP)) >> >> If nvme_tcp_offload_limits does not find a dst_entry on the socket then >> offloading_netdev may not NULL at this point. > > correct :( will look on that > That's only partially true. If nvme_tcp_offload_limits finds a dst_entry, but then the netdevice goes down, then the check here will catch it. This is needed because nvme_tcp_offload_limits doesn't hold a reference! We opted not to grab a reference on nvme_tcp_offload_limits because it doesn't create a context. >>> + queue->ctrl->offloading_netdev = NULL; >>> + return -ENODEV; >>> + } >>> + >>> + if (netdev->features & NETIF_F_HW_TCP_DDP && >>> + netdev->tcp_ddp_ops && >>> + netdev->tcp_ddp_ops->tcp_ddp_limits) >>> + ret = netdev->tcp_ddp_ops->tcp_ddp_limits(netdev, &limits); >>> + else >>> + ret = -EOPNOTSUPP; >>> + >>> + if (!ret) { >>> + queue->ctrl->offloading_netdev = netdev; >> >> you save a reference to the netdev here, but then release the refcnt >> below. That device could be deleted between this point in time and the >> initialization of all queues. > That's true, and this is why we repeat the checks there. 
We avoid holding the reference here because there is no obvious complementary release point for nvme_tcp_offload_limits and there is no hardware context created here, so there is no real need to hold it at this stage.
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 881d28eb15e9..36de4391ba76 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -14,6 +14,7 @@ #include <linux/blk-mq.h> #include <crypto/hash.h> #include <net/busy_poll.h> +#include <net/tcp_ddp.h> #include "nvme.h" #include "fabrics.h" @@ -62,6 +63,7 @@ enum nvme_tcp_queue_flags { NVME_TCP_Q_ALLOCATED = 0, NVME_TCP_Q_LIVE = 1, NVME_TCP_Q_POLLING = 2, + NVME_TCP_Q_OFF_DDP = 3, }; enum nvme_tcp_recv_state { @@ -111,6 +113,8 @@ struct nvme_tcp_queue { void (*state_change)(struct sock *); void (*data_ready)(struct sock *); void (*write_space)(struct sock *); + + atomic64_t resync_req; }; struct nvme_tcp_ctrl { @@ -129,6 +133,8 @@ struct nvme_tcp_ctrl { struct delayed_work connect_work; struct nvme_tcp_request async_req; u32 io_queues[HCTX_MAX_TYPES]; + + struct net_device *offloading_netdev; }; static LIST_HEAD(nvme_tcp_ctrl_list); @@ -223,6 +229,164 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, return nvme_tcp_pdu_data_left(req) <= len; } +#ifdef CONFIG_TCP_DDP + +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags); +static const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops = { + .resync_request = nvme_tcp_resync_request, +}; + +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) +{ + struct net_device *netdev = queue->ctrl->offloading_netdev; + struct nvme_tcp_ddp_config config = {}; + int ret; + + if (!(netdev->features & NETIF_F_HW_TCP_DDP)) + return -EOPNOTSUPP; + + config.cfg.type = TCP_DDP_NVME; + config.pfv = NVME_TCP_PFV_1_0; + config.cpda = 0; + config.dgst = queue->hdr_digest ? + NVME_TCP_HDR_DIGEST_ENABLE : 0; + config.dgst |= queue->data_digest ? 
+ NVME_TCP_DATA_DIGEST_ENABLE : 0; + config.queue_size = queue->queue_size; + config.queue_id = nvme_tcp_queue_id(queue); + config.io_cpu = queue->io_cpu; + + dev_hold(netdev); /* put by unoffload_socket */ + ret = netdev->tcp_ddp_ops->tcp_ddp_sk_add(netdev, + queue->sock->sk, + &config.cfg); + if (ret) { + dev_put(netdev); + return ret; + } + + inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = &nvme_tcp_ddp_ulp_ops; + if (netdev->features & NETIF_F_HW_TCP_DDP) + set_bit(NVME_TCP_Q_OFF_DDP, &queue->flags); + + return ret; +} + +static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue) +{ + struct net_device *netdev = queue->ctrl->offloading_netdev; + + if (!netdev) { + dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n"); + return; + } + + netdev->tcp_ddp_ops->tcp_ddp_sk_del(netdev, queue->sock->sk); + + inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = NULL; + dev_put(netdev); /* held by offload_socket */ +} + +static int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue) +{ + struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true); + struct tcp_ddp_limits limits; + int ret = 0; + + if (!netdev) { + dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n"); + queue->ctrl->offloading_netdev = NULL; + return -ENODEV; + } + + if (netdev->features & NETIF_F_HW_TCP_DDP && + netdev->tcp_ddp_ops && + netdev->tcp_ddp_ops->tcp_ddp_limits) + ret = netdev->tcp_ddp_ops->tcp_ddp_limits(netdev, &limits); + else + ret = -EOPNOTSUPP; + + if (!ret) { + queue->ctrl->offloading_netdev = netdev; + dev_dbg_ratelimited(queue->ctrl->ctrl.device, + "netdev %s offload limits: max_ddp_sgl_len %d\n", + netdev->name, limits.max_ddp_sgl_len); + queue->ctrl->ctrl.max_segments = limits.max_ddp_sgl_len; + queue->ctrl->ctrl.max_hw_sectors = + limits.max_ddp_sgl_len << (ilog2(SZ_4K) - 9); + } else { + queue->ctrl->offloading_netdev = NULL; + } + + /* release the device as no offload context is established yet. 
*/ + dev_put(netdev); + + return ret; +} + +static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, + struct sk_buff *skb, unsigned int offset) +{ + u64 pdu_seq = TCP_SKB_CB(skb)->seq + offset - queue->pdu_offset; + struct net_device *netdev = queue->ctrl->offloading_netdev; + u64 resync_val; + u32 resync_seq; + + resync_val = atomic64_read(&queue->resync_req); + /* Lower 32 bit flags. Check validity of the request */ + if ((resync_val & TCP_DDP_RESYNC_REQ) == 0) + return; + + /* Obtain and check requested sequence number: is this PDU header before the request? */ + resync_seq = resync_val >> 32; + if (before(pdu_seq, resync_seq)) + return; + + if (unlikely(!netdev)) { + pr_info_ratelimited("%s: netdev not found\n", __func__); + return; + } + + /** + * The atomic operation gurarantees that we don't miss any NIC driver + * resync requests submitted after the above checks. + */ + if (atomic64_cmpxchg(&queue->resync_req, resync_val, + resync_val & ~TCP_DDP_RESYNC_REQ)) + netdev->tcp_ddp_ops->tcp_ddp_resync(netdev, queue->sock->sk, pdu_seq); +} + +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags) +{ + struct nvme_tcp_queue *queue = sk->sk_user_data; + + atomic64_set(&queue->resync_req, + (((uint64_t)seq << 32) | flags)); + + return true; +} + +#else + +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) +{ + return -EINVAL; +} + +static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue) +{} + +static int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue) +{ + return -EINVAL; +} + +static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, + struct sk_buff *skb, unsigned int offset) +{} + +#endif + static void nvme_tcp_init_iter(struct nvme_tcp_request *req, unsigned int dir) { @@ -638,6 +802,9 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining); int ret; + if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags)) + 
nvme_tcp_resync_response(queue, skb, *offset); + ret = skb_copy_bits(skb, *offset, &pdu[queue->pdu_offset], rcv_len); if (unlikely(ret)) @@ -1532,6 +1699,9 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) kernel_sock_shutdown(queue->sock, SHUT_RDWR); nvme_tcp_restore_sock_calls(queue); cancel_work_sync(&queue->io_work); + + if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags)) + nvme_tcp_unoffload_socket(queue); } static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) @@ -1550,10 +1720,13 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); int ret; - if (idx) + if (idx) { ret = nvmf_connect_io_queue(nctrl, idx, false); - else + nvme_tcp_offload_socket(&ctrl->queues[idx]); + } else { ret = nvmf_connect_admin_queue(nctrl); + nvme_tcp_offload_limits(&ctrl->queues[idx]); + } if (!ret) { set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);