@@ -52,6 +52,10 @@
#include <net/net_trackers.h>
#include <net/net_debug.h>
+#ifdef CONFIG_ULP_DDP
+#include <net/ulp_ddp_caps.h>
+#endif
+
struct netpoll_info;
struct device;
struct ethtool_ops;
@@ -1392,6 +1396,8 @@ struct netdev_net_notifier {
* Get hardware timestamp based on normal/adjustable time or free running
* cycle counter. This function is required if physical clock supports a
* free running cycle counter.
+ * struct ulp_ddp_dev_ops *ulp_ddp_ops;
+ * ULP DDP operations (see include/net/ulp_ddp.h)
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
@@ -1616,6 +1622,9 @@ struct net_device_ops {
ktime_t (*ndo_get_tstamp)(struct net_device *dev,
const struct skb_shared_hwtstamps *hwtstamps,
bool cycles);
+#if IS_ENABLED(CONFIG_ULP_DDP)
+ const struct ulp_ddp_dev_ops *ulp_ddp_ops;
+#endif
};
/**
@@ -2071,6 +2080,9 @@ struct net_device {
netdev_features_t mpls_features;
netdev_features_t gso_partial_features;
+#ifdef CONFIG_ULP_DDP
+ struct ulp_ddp_netdev_caps ulp_ddp_caps;
+#endif
unsigned int min_mtu;
unsigned int max_mtu;
unsigned short type;
@@ -811,6 +811,8 @@ typedef unsigned char *sk_buff_data_t;
* delivery_time in mono clock base (i.e. EDT). Otherwise, the
* skb->tstamp has the (rcv) timestamp at ingress and
* delivery_time at egress.
+ * @ulp_ddp: DDP offloaded
+ * @ulp_crc: CRC offloaded
* @napi_id: id of the NAPI struct this skb came from
* @sender_cpu: (aka @napi_id) source CPU in XPS
* @alloc_cpu: CPU which did the skb allocation.
@@ -983,6 +985,10 @@ struct sk_buff {
__u8 slow_gro:1;
__u8 csum_not_inet:1;
__u8 scm_io_uring:1;
+#ifdef CONFIG_ULP_DDP
+ __u8 ulp_ddp:1;
+ __u8 ulp_crc:1;
+#endif
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
@@ -5053,5 +5059,23 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
}
#endif
+static inline bool skb_is_ulp_ddp(const struct sk_buff *skb)
+{
+#ifdef CONFIG_ULP_DDP
+	return skb->ulp_ddp;
+#else
+	return false;
+#endif
+}
+
+static inline bool skb_is_ulp_crc(const struct sk_buff *skb)
+{
+#ifdef CONFIG_ULP_DDP
+	return skb->ulp_crc;
+#else
+	return false;
+#endif
+}
+
#endif /* __KERNEL__ */
#endif /* _LINUX_SKBUFF_H */
@@ -68,6 +68,8 @@ struct inet_connection_sock_af_ops {
* @icsk_ulp_ops Pluggable ULP control hook
* @icsk_ulp_data ULP private data
* @icsk_clean_acked Clean acked data hook
+ * @icsk_ulp_ddp_ops Pluggable ULP direct data placement control hook
+ * @icsk_ulp_ddp_data ULP direct data placement private data
* @icsk_ca_state: Congestion control state
* @icsk_retransmits: Number of unrecovered [RTO] timeouts
* @icsk_pending: Scheduled timer event
@@ -98,6 +100,8 @@ struct inet_connection_sock {
const struct tcp_ulp_ops *icsk_ulp_ops;
void __rcu *icsk_ulp_data;
void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
+ const struct ulp_ddp_ulp_ops *icsk_ulp_ddp_ops;
+ void __rcu *icsk_ulp_ddp_data;
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state:5,
icsk_ca_initialized:1,
new file mode 100644
@@ -0,0 +1,173 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * ulp_ddp.h
+ * Author: Boris Pismenny <borisp@nvidia.com>
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+#ifndef _ULP_DDP_H
+#define _ULP_DDP_H
+
+#include <linux/netdevice.h>
+#include <net/inet_connection_sock.h>
+#include <net/sock.h>
+
+#include "ulp_ddp_caps.h"
+
+enum ulp_ddp_type {
+ ULP_DDP_NVME = 1,
+};
+
+/**
+ * struct nvme_tcp_ddp_limits - nvme tcp driver limitations
+ *
+ * @full_ccid_range: true if the driver supports the full CID range
+ */
+struct nvme_tcp_ddp_limits {
+ bool full_ccid_range;
+};
+
+/**
+ * struct ulp_ddp_limits - Generic ulp ddp limits: tcp ddp
+ * protocol limits.
+ * Add new instances of ulp_ddp_limits in the union below (nvme-tcp, etc.).
+ *
+ * @type: type of this limits struct
+ * @max_ddp_sgl_len: maximum sgl size supported (zero means no limit)
+ * @io_threshold: minimum payload size required to offload
+ * @nvmeotcp: NVMe-TCP specific limits
+ */
+struct ulp_ddp_limits {
+ enum ulp_ddp_type type;
+ int max_ddp_sgl_len;
+ int io_threshold;
+ union {
+ struct nvme_tcp_ddp_limits nvmeotcp;
+ };
+};
+
+/**
+ * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue
+ *
+ * @pfv: pdu version (e.g., NVME_TCP_PFV_1_0)
+ * @cpda: controller pdu data alignment (dwords, 0's based)
+ * @dgst: digest types enabled (header or data, see enum nvme_tcp_digest_option).
+ * The netdev will offload crc if it is supported.
+ * @queue_size: number of nvme-tcp IO queue elements
+ * @queue_id: queue identifier
+ * @io_cpu: cpu core running the IO thread for this queue
+ */
+struct nvme_tcp_ddp_config {
+ u16 pfv;
+ u8 cpda;
+ u8 dgst;
+ int queue_size;
+ int queue_id;
+ int io_cpu;
+};
+
+/**
+ * struct ulp_ddp_config - Generic ulp ddp configuration
+ * Add new instances of ulp_ddp_config in the union below (nvme-tcp, etc.).
+ *
+ * @type: type of this config struct
+ * @nvmeotcp: NVMe-TCP specific config
+ */
+struct ulp_ddp_config {
+ enum ulp_ddp_type type;
+ union {
+ struct nvme_tcp_ddp_config nvmeotcp;
+ };
+};
+
+/**
+ * struct ulp_ddp_io - ulp ddp configuration for an IO request.
+ *
+ * @command_id: identifier on the wire associated with these buffers
+ * @nents: number of entries in the sg_table
+ * @sg_table: describing the buffers for this IO request
+ * @first_sgl: first SGL in sg_table
+ */
+struct ulp_ddp_io {
+ u32 command_id;
+ int nents;
+ struct sg_table sg_table;
+ struct scatterlist first_sgl[SG_CHUNK_SIZE];
+};
+
+/**
+ * struct ulp_ddp_dev_ops - operations used by an upper layer protocol
+ * to configure ddp offload
+ *
+ * @ulp_ddp_limits: query ulp driver limitations and quirks.
+ * @ulp_ddp_sk_add: add offload for the queue represented by socket+config
+ * pair. this function is used to configure either copy, crc
+ * or both offloads.
+ * @ulp_ddp_sk_del: remove offload from the socket, and release any device
+ * related resources.
+ * @ulp_ddp_setup: request copy offload for buffers associated with a
+ * command_id in ulp_ddp_io.
+ * @ulp_ddp_teardown: release offload resources association between buffers
+ * and command_id in ulp_ddp_io.
+ * @ulp_ddp_resync: respond to the driver's resync_request. Called only if
+ * resync is successful.
+ */
+struct ulp_ddp_dev_ops {
+ int (*ulp_ddp_limits)(struct net_device *netdev,
+ struct ulp_ddp_limits *limits);
+ int (*ulp_ddp_sk_add)(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_config *config);
+ void (*ulp_ddp_sk_del)(struct net_device *netdev,
+ struct sock *sk);
+ int (*ulp_ddp_setup)(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_io *io);
+ void (*ulp_ddp_teardown)(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_io *io,
+ void *ddp_ctx);
+ void (*ulp_ddp_resync)(struct net_device *netdev,
+ struct sock *sk, u32 seq);
+};
+
+#define ULP_DDP_RESYNC_PENDING BIT(0)
+
+/**
+ * struct ulp_ddp_ulp_ops - Interface to register upper layer
+ * Direct Data Placement (DDP) TCP offload.
+ * @resync_request: NIC requests ulp to indicate if @seq is the start
+ * of a message.
+ * @ddp_teardown_done: NIC driver informs the ulp that teardown is done,
+ * used for async completions.
+ */
+struct ulp_ddp_ulp_ops {
+ bool (*resync_request)(struct sock *sk, u32 seq, u32 flags);
+ void (*ddp_teardown_done)(void *ddp_ctx);
+};
+
+/**
+ * struct ulp_ddp_ctx - Generic ulp ddp context
+ *
+ * @type: type of this context struct
+ * @buf: protocol-specific context struct
+ */
+struct ulp_ddp_ctx {
+ enum ulp_ddp_type type;
+ unsigned char buf[];
+};
+
+static inline struct ulp_ddp_ctx *ulp_ddp_get_ctx(const struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return (__force struct ulp_ddp_ctx *)icsk->icsk_ulp_ddp_data;
+}
+
+static inline void ulp_ddp_set_ctx(struct sock *sk, void *ctx)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ rcu_assign_pointer(icsk->icsk_ulp_ddp_data, ctx);
+}
+
+#endif /* _ULP_DDP_H */
new file mode 100644
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * ulp_ddp_caps.h
+ * Author: Aurelien Aptel <aaptel@nvidia.com>
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+#ifndef _ULP_DDP_CAPS_H
+#define _ULP_DDP_CAPS_H
+
+#include <linux/types.h>
+
+enum {
+ ULP_DDP_C_NVME_TCP_BIT,
+ ULP_DDP_C_NVME_TCP_DDGST_RX_BIT,
+
+ /* add capabilities above */
+ ULP_DDP_C_COUNT,
+};
+
+#define __ULP_DDP_C_BIT(bit)	((u64)1 << (bit))
+#define __ULP_DDP_C(name)	__ULP_DDP_C_BIT(ULP_DDP_C_##name##_BIT)
+
+#define ULP_DDP_C_NVME_TCP		__ULP_DDP_C(NVME_TCP)
+#define ULP_DDP_C_NVME_TCP_DDGST_RX	__ULP_DDP_C(NVME_TCP_DDGST_RX)
+
+struct ulp_ddp_netdev_caps {
+ DECLARE_BITMAP(active, ULP_DDP_C_COUNT);
+ DECLARE_BITMAP(hw, ULP_DDP_C_COUNT);
+};
+
+static inline bool ulp_ddp_cap_turned_on(const unsigned long *old, const unsigned long *new, int bit_nr)
+{
+	return !test_bit(bit_nr, old) && test_bit(bit_nr, new);
+}
+
+static inline bool ulp_ddp_cap_turned_off(const unsigned long *old, const unsigned long *new, int bit_nr)
+{
+	return test_bit(bit_nr, old) && !test_bit(bit_nr, new);
+}
+
+#endif
@@ -471,4 +471,24 @@ config NETDEV_ADDR_LIST_TEST
default KUNIT_ALL_TESTS
depends on KUNIT
+config ULP_DDP
+ bool "ULP direct data placement offload"
+ help
+ This feature provides a generic infrastructure for Direct
+ Data Placement (DDP) offload for Upper Layer Protocols (ULP,
+ such as NVMe-TCP).
+
+ If the ULP and NIC driver supports it, the ULP code can
+ request the NIC to place ULP response data directly
+ into application memory, avoiding a costly copy.
+
+ This infrastructure also allows for offloading the ULP data
+ integrity checks (e.g. data digest) that would otherwise
+ require another costly pass on the data we managed to avoid
+ copying.
+
+ For more information, see
+ <file:Documentation/networking/ulp-ddp-offload.rst>.
+
+
endif # if NET
@@ -72,6 +72,7 @@
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool.h>
+#include <net/ulp_ddp.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -6476,7 +6477,7 @@ void skb_condense(struct sk_buff *skb)
{
if (skb->data_len) {
if (skb->data_len > skb->end - skb->tail ||
- skb_cloned(skb))
+ skb_cloned(skb) || skb_is_ulp_ddp(skb))
return;
/* Nice, we can free page frag(s) right now */
@@ -5234,6 +5234,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
#ifdef CONFIG_TLS_DEVICE
nskb->decrypted = skb->decrypted;
+#endif
+#ifdef CONFIG_ULP_DDP
+ nskb->ulp_ddp = skb->ulp_ddp;
+ nskb->ulp_crc = skb->ulp_crc;
#endif
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
if (list)
@@ -5267,6 +5271,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
#ifdef CONFIG_TLS_DEVICE
if (skb->decrypted != nskb->decrypted)
goto end;
+#endif
+#ifdef CONFIG_ULP_DDP
+ if (skb_is_ulp_crc(skb) != skb_is_ulp_crc(nskb))
+ goto end;
#endif
}
}
@@ -1861,6 +1861,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
tail->decrypted != skb->decrypted ||
+#endif
+#ifdef CONFIG_ULP_DDP
+ skb_is_ulp_crc(tail) != skb_is_ulp_crc(skb) ||
#endif
thtail->doff != th->doff ||
memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
@@ -268,6 +268,9 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
#ifdef CONFIG_TLS_DEVICE
flush |= p->decrypted ^ skb->decrypted;
#endif
+#ifdef CONFIG_ULP_DDP
+ flush |= skb_is_ulp_crc(p) ^ skb_is_ulp_crc(skb);
+#endif
if (flush || skb_gro_receive(p, skb)) {
mss = 1;