[v5,net-next,01/36] net: Introduce direct data placement tcp offload

Message ID	20210722110325.371-2-borisp@nvidia.com (mailing list archive)
State	Changes Requested
Delegated to:	Netdev Maintainers
Headers	show Return-Path: <netdev-owner@kernel.org> Received-SPF: Pass (protection.outlook.com: domain of nvidia.com designates 216.228.112.35 as permitted sender) receiver=protection.outlook.com; client-ip=216.228.112.35; helo=mail.nvidia.com; From: Boris Pismenny <borisp@nvidia.com> To: <dsahern@gmail.com>, <kuba@kernel.org>, <davem@davemloft.net>, <saeedm@nvidia.com>, <hch@lst.de>, <sagi@grimberg.me>, <axboe@fb.com>, <kbusch@kernel.org>, <viro@zeniv.linux.org.uk>, <edumazet@google.com>, <smalin@marvell.com> CC: <boris.pismenny@gmail.com>, <linux-nvme@lists.infradead.org>, <netdev@vger.kernel.org>, <benishay@nvidia.com>, <ogerlitz@nvidia.com>, <yorayz@nvidia.com>, Boris Pismenny <borisp@mellanox.com>, Ben Ben-Ishay <benishay@mellanox.com>, Or Gerlitz <ogerlitz@mellanox.com>, Yoray Zack <yorayz@mellanox.com> Subject: [PATCH v5 net-next 01/36] net: Introduce direct data placement tcp offload Date: Thu, 22 Jul 2021 14:02:50 +0300 Message-ID: <20210722110325.371-2-borisp@nvidia.com> In-Reply-To: <20210722110325.371-1-borisp@nvidia.com> References: <20210722110325.371-1-borisp@nvidia.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Content-Type: text/plain Precedence: bulk
Series	nvme-tcp receive and transmit offloads \| expand [v5,net-next,00/36] nvme-tcp receive and transmit offloads [v5,net-next,01/36] net: Introduce direct data placement tcp offload [v5,net-next,02/36] iov_iter: DDP copy to iter/pages [v5,net-next,03/36] net: skb copy(+hash) iterators for DDP offloads [v5,net-next,04/36] net/tls: expose get_netdev_for_sock [v5,net-next,05/36] nvme-tcp: Add DDP offload control path [v5,net-next,06/36] nvme-tcp: Add DDP data-path [v5,net-next,07/36] nvme-tcp: RX DDGST offload [v5,net-next,08/36] nvme-tcp: Deal with netdevice DOWN events [v5,net-next,09/36] net/mlx5: Header file changes for nvme-tcp offload [v5,net-next,10/36] net/mlx5: Add 128B CQE for NVMEoTCP offload [v5,net-next,11/36] net/mlx5e: TCP flow steering for nvme-tcp [v5,net-next,12/36] net/mlx5e: NVMEoTCP offload initialization [v5,net-next,13/36] net/mlx5e: KLM UMR helper macros [v5,net-next,14/36] net/mlx5e: NVMEoTCP use KLM UMRs [v5,net-next,15/36] net/mlx5e: NVMEoTCP queue init/teardown [v5,net-next,16/36] net/mlx5e: NVMEoTCP async ddp invalidation [v5,net-next,17/36] net/mlx5e: NVMEoTCP ddp setup and resync [v5,net-next,18/36] net/mlx5e: NVMEoTCP, data-path for DDP+DDGST offload [v5,net-next,19/36] net/mlx5e: NVMEoTCP statistics [v5,net-next,20/36] Documentation: add ULP DDP offload documentation [v5,net-next,21/36] net: drop ULP DDP HW offload feature if no CSUM offload feature [v5,net-next,22/36] net: Add ulp_ddp_pdu_info struct [v5,net-next,23/36] net: Add to ulp_ddp support for fallback flow [v5,net-next,24/36] net: Add MSG_DDP_CRC flag [v5,net-next,25/36] nvme-tcp: TX DDGST offload [v5,net-next,26/36] nvme-tcp: Mapping between Tx NVMEoTCP pdu and TCP sequence [v5,net-next,27/36] mlx5e: make preparation in TLS code for NVMEoTCP CRC Tx offload [v5,net-next,28/36] mlx5: Add sq state test bit for nvmeotcp [v5,net-next,29/36] mlx5: Add support to NETIF_F_HW_TCP_DDP_CRC_TX feature [v5,net-next,30/36] net/mlx5e: NVMEoTCP DDGST TX offload TIS [v5,net-next,31/36] 
net/mlx5e: NVMEoTCP DDGST Tx offload queue init/teardown [v5,net-next,32/36] net/mlx5e: NVMEoTCP DDGST TX BSF and PSV [v5,net-next,33/36] net/mlx5e: NVMEoTCP DDGST TX Data path [v5,net-next,34/36] net/mlx5e: NVMEoTCP DDGST TX handle OOO packets [v5,net-next,35/36] net/mlx5e: NVMEoTCP DDGST TX offload optimization [v5,net-next,36/36] net/mlx5e: NVMEoTCP DDGST TX statistics

Context	Check	Description
netdev/apply	fail	Patch does not apply to net-next
netdev/tree_selection	success	Clearly marked for net-next

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 2c6b9e416225..d9bd6ea26fc8 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -14,7 +14,7 @@ typedef u64 netdev_features_t; enum { NETIF_F_SG_BIT, /* Scatter/gather IO. */ NETIF_F_IP_CSUM_BIT, /* Can checksum TCP/UDP over IPv4. */ - __UNUSED_NETIF_F_1, + NETIF_F_HW_ULP_DDP_BIT, /* ULP direct data placement offload */ NETIF_F_HW_CSUM_BIT, /* Can checksum all the packets. */ NETIF_F_IPV6_CSUM_BIT, /* Can checksum TCP/UDP over IPV6 */ NETIF_F_HIGHDMA_BIT, /* Can DMA to high memory. */ @@ -168,6 +168,7 @@ enum { #define NETIF_F_HW_HSR_TAG_RM __NETIF_F(HW_HSR_TAG_RM) #define NETIF_F_HW_HSR_FWD __NETIF_F(HW_HSR_FWD) #define NETIF_F_HW_HSR_DUP __NETIF_F(HW_HSR_DUP) +#define NETIF_F_HW_ULP_DDP __NETIF_F(HW_ULP_DDP) /* Finds the next feature with the highest number of the range of start till 0. */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eaf5bb008aa9..cba92c2dd9c0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1005,6 +1005,7 @@ struct dev_ifalias { struct devlink; struct tlsdev_ops; +struct ulp_ddp_dev_ops; struct netdev_name_node { struct hlist_node hlist; @@ -2024,6 +2025,10 @@ struct net_device { const struct tlsdev_ops *tlsdev_ops; #endif +#if IS_ENABLED(CONFIG_ULP_DDP) + const struct ulp_ddp_dev_ops *ulp_ddp_ops; +#endif + const struct header_ops *header_ops; unsigned char operstate; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b2db9cd9a73f..d323ecd37448 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -689,6 +689,7 @@ typedef unsigned char *sk_buff_data_t; * CHECKSUM_UNNECESSARY (max 3) * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB + * @ddp_crc: DDP or CRC offloaded * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @secmark: security marking @@ -870,6 +871,9 @@ struct 
sk_buff { #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; #endif +#ifdef CONFIG_ULP_DDP + __u8 ddp_crc:1; +#endif #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index b06c2d02ec84..66801ea72fb4 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -66,6 +66,8 @@ struct inet_connection_sock_af_ops { * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data * @icsk_clean_acked Clean acked data hook + * @icsk_ulp_ddp_ops Pluggable ULP direct data placement control hook + * @icsk_ulp_ddp_data ULP direct data placement private data * @icsk_listen_portaddr_node hash to the portaddr listener hashtable * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts @@ -96,6 +98,8 @@ struct inet_connection_sock { const struct tcp_ulp_ops *icsk_ulp_ops; void __rcu *icsk_ulp_data; void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); + const struct ulp_ddp_ulp_ops *icsk_ulp_ddp_ops; + void __rcu *icsk_ulp_ddp_data; struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:5, diff --git a/include/net/ulp_ddp.h b/include/net/ulp_ddp.h new file mode 100644 index 000000000000..1a0b464ff40b --- /dev/null +++ b/include/net/ulp_ddp.h @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * ulp_ddp.h + * Author: Boris Pismenny <borisp@mellanox.com> + * Copyright (C) 2021 Mellanox Technologies. 
+ */ +#ifndef _ULP_DDP_H +#define _ULP_DDP_H + +#include <linux/netdevice.h> +#include <net/inet_connection_sock.h> +#include <net/sock.h> + +/* limits returned by the offload driver, zero means don't care */ +struct ulp_ddp_limits { + int max_ddp_sgl_len; +}; + +enum ulp_ddp_type { + ULP_DDP_NVME = 1, +}; + +/** + * struct ulp_ddp_config - Generic ulp ddp configuration: tcp ddp IO queue + * config implementations must use this as the first member. + * Add new instances of ulp_ddp_config below (nvme-tcp, etc.). + */ +struct ulp_ddp_config { + enum ulp_ddp_type type; + unsigned char buf[]; +}; + +/** + * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue + * + * @pfv: pdu version (e.g., NVME_TCP_PFV_1_0) + * @cpda: controller pdu data alignment (dwords, 0's based) + * @dgst: digest types enabled. + * The netdev will offload crc if ddp_crc is supported. + * @queue_size: number of nvme-tcp IO queue elements + * @queue_id: queue identifier + * @io_cpu: cpu core running the IO thread for this queue + */ +struct nvme_tcp_ddp_config { + struct ulp_ddp_config cfg; + + u16 pfv; + u8 cpda; + u8 dgst; + int queue_size; + int queue_id; + int io_cpu; +}; + +/** + * struct ulp_ddp_io - ulp ddp configuration for an IO request. + * + * @command_id: identifier on the wire associated with these buffers + * @nents: number of entries in the sg_table + * @sg_table: describing the buffers for this IO request + * @first_sgl: first SGL in sg_table + */ +struct ulp_ddp_io { + u32 command_id; + int nents; + struct sg_table sg_table; + struct scatterlist first_sgl[SG_CHUNK_SIZE]; +}; + +/* struct ulp_ddp_dev_ops - operations used by an upper layer protocol to configure ddp offload + * + * @ulp_ddp_limits: limit the number of scatter gather entries per IO. + * the device driver can use this to limit the resources allocated per queue. + * @ulp_ddp_sk_add: add offload for the queue represented by the socket+config pair. 
+ * this function is used to configure either copy, crc or both offloads. + * @ulp_ddp_sk_del: remove offload from the socket, and release any device related resources. + * @ulp_ddp_setup: request copy offload for buffers associated with a command_id in ulp_ddp_io. + * @ulp_ddp_teardown: release offload resources association between buffers and command_id in + * ulp_ddp_io. + * @ulp_ddp_resync: respond to the driver's resync_request. Called only if resync is successful. + */ +struct ulp_ddp_dev_ops { + int (*ulp_ddp_limits)(struct net_device *netdev, + struct ulp_ddp_limits *limits); + int (*ulp_ddp_sk_add)(struct net_device *netdev, + struct sock *sk, + struct ulp_ddp_config *config); + void (*ulp_ddp_sk_del)(struct net_device *netdev, + struct sock *sk); + int (*ulp_ddp_setup)(struct net_device *netdev, + struct sock *sk, + struct ulp_ddp_io *io); + int (*ulp_ddp_teardown)(struct net_device *netdev, + struct sock *sk, + struct ulp_ddp_io *io, + void *ddp_ctx); + void (*ulp_ddp_resync)(struct net_device *netdev, + struct sock *sk, u32 seq); +}; + +#define ULP_DDP_RESYNC_REQ BIT(0) + +/** + * struct ulp_ddp_ulp_ops - Interface to register upper layer Direct Data Placement (DDP) TCP offload + */ +struct ulp_ddp_ulp_ops { + /* NIC requests ulp to indicate if @seq is the start of a message */ + bool (*resync_request)(struct sock *sk, u32 seq, u32 flags); + /* NIC driver informs the ulp that ddp teardown is done - used for async completions */ + void (*ddp_teardown_done)(void *ddp_ctx); +}; + +/** + * struct ulp_ddp_ctx - Generic ulp ddp context: device driver per queue contexts must + * use this as the first member. 
+ */ +struct ulp_ddp_ctx { + enum ulp_ddp_type type; + unsigned char buf[]; +}; + +static inline struct ulp_ddp_ctx *ulp_ddp_get_ctx(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + return (__force struct ulp_ddp_ctx *)icsk->icsk_ulp_ddp_data; +} + +static inline void ulp_ddp_set_ctx(struct sock *sk, void *ctx) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + rcu_assign_pointer(icsk->icsk_ulp_ddp_data, ctx); +} + +#endif //_ULP_DDP_H diff --git a/net/Kconfig b/net/Kconfig index c7392c449b25..b6f0ccbea1e3 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -454,4 +454,14 @@ config ETHTOOL_NETLINK netlink. It provides better extensibility and some new features, e.g. notification messages. +config ULP_DDP + bool "ULP direct data placement offload" + default n + help + Direct Data Placement (DDP) offload enables ULP, such as + NVMe-TCP/iSCSI, to request the NIC to place ULP payload data + of a command response directly into kernel pages while + calculate/verify the data digest on ULP PDU as they go through + the NIC. Thus avoiding the costly per-byte overhead. 
+ endif # if NET diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 12aabcda6db2..20add6c3f2e6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -71,6 +71,7 @@ #include <net/mpls.h> #include <net/mptcp.h> #include <net/page_pool.h> +#include <net/ulp_ddp.h> #include <linux/uaccess.h> #include <trace/events/skb.h> @@ -6295,9 +6296,14 @@ EXPORT_SYMBOL(pskb_extract); */ void skb_condense(struct sk_buff *skb) { + bool is_ddp = false; + +#ifdef CONFIG_ULP_DDP + is_ddp = skb->ddp_crc; +#endif if (skb->data_len) { if (skb->data_len > skb->end - skb->tail || - skb_cloned(skb)) + skb_cloned(skb) || is_ddp) return; /* Nice, we can free page frag(s) right now */ diff --git a/net/ethtool/common.c b/net/ethtool/common.c index f9dcbad84788..d545d1525800 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -73,6 +73,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_HW_HSR_TAG_RM_BIT] = "hsr-tag-rm-offload", [NETIF_F_HW_HSR_FWD_BIT] = "hsr-fwd-offload", [NETIF_F_HW_HSR_DUP_BIT] = "hsr-dup-offload", + [NETIF_F_HW_ULP_DDP_BIT] = "ulp-ddp-offload", }; const char diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e6ca5a1f3b59..4a7160bba09b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5149,6 +5149,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); #ifdef CONFIG_TLS_DEVICE nskb->decrypted = skb->decrypted; +#endif +#ifdef CONFIG_ULP_DDP + nskb->ddp_crc = skb->ddp_crc; #endif TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) @@ -5182,6 +5185,11 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, #ifdef CONFIG_TLS_DEVICE if (skb->decrypted != nskb->decrypted) goto end; +#endif +#ifdef CONFIG_ULP_DDP + + if (skb->ddp_crc != nskb->ddp_crc) + goto end; #endif } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e66ad6bfe808..3d9849a39b82 100644 --- 
a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1830,6 +1830,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || #ifdef CONFIG_TLS_DEVICE tail->decrypted != skb->decrypted || +#endif +#ifdef CONFIG_ULP_DDP + tail->ddp_crc != skb->ddp_crc || #endif thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index e09147ac9a99..96e8228d2b96 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -262,6 +262,9 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) #ifdef CONFIG_TLS_DEVICE flush |= p->decrypted ^ skb->decrypted; #endif +#ifdef CONFIG_ULP_DDP + flush |= p->ddp_crc ^ skb->ddp_crc; +#endif if (flush || skb_gro_receive(p, skb)) { mss = 1;

[v5,net-next,01/36] net: Introduce direct data placement tcp offload

Checks

Commit Message

Comments

Patch