@@ -53,6 +53,10 @@
#include <net/net_debug.h>
#include <net/dropreason-core.h>
+#ifdef CONFIG_ULP_DDP
+#include <net/ulp_ddp_caps.h>
+#endif
+
struct netpoll_info;
struct device;
struct ethtool_ops;
@@ -1427,6 +1431,8 @@ struct netdev_net_notifier {
* struct kernel_hwtstamp_config *kernel_config,
* struct netlink_ext_ack *extack);
* Change the hardware timestamping parameters for NIC device.
+ * const struct ulp_ddp_dev_ops *ulp_ddp_ops;
+ * ULP DDP operations (see include/net/ulp_ddp.h)
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
@@ -1664,6 +1670,9 @@ struct net_device_ops {
int (*ndo_hwtstamp_set)(struct net_device *dev,
struct kernel_hwtstamp_config *kernel_config,
struct netlink_ext_ack *extack);
+#if IS_ENABLED(CONFIG_ULP_DDP)
+ const struct ulp_ddp_dev_ops *ulp_ddp_ops;
+#endif
};
/**
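For illustration, a driver would publish these callbacks through its net_device_ops; a minimal sketch, with all my_drv_* names hypothetical:

static const struct ulp_ddp_dev_ops my_drv_ulp_ddp_ops = {
	.limits		= my_drv_limits,
	.sk_add		= my_drv_sk_add,
	.sk_del		= my_drv_sk_del,
	.setup		= my_drv_setup,
	.teardown	= my_drv_teardown,
	.resync		= my_drv_resync,
	.set_caps	= my_drv_set_caps,
	.get_stats	= my_drv_get_stats,
};

static const struct net_device_ops my_drv_netdev_ops = {
	/* ... regular ndo callbacks ... */
#if IS_ENABLED(CONFIG_ULP_DDP)
	.ulp_ddp_ops	= &my_drv_ulp_ddp_ops,
#endif
};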
@@ -1835,6 +1844,9 @@ enum netdev_ml_priv_type {
* @mpls_features: Mask of features inheritable by MPLS
* @gso_partial_features: value(s) from NETIF_F_GSO\*
*
+ * @ulp_ddp_caps: Bitflags keeping track of supported and enabled
+ * ULP DDP capabilities.
+ *
* @ifindex: interface index
* @group: The group the device belongs to
*
@@ -2134,6 +2146,9 @@ struct net_device {
netdev_features_t mpls_features;
netdev_features_t gso_partial_features;
+#ifdef CONFIG_ULP_DDP
+ struct ulp_ddp_netdev_caps ulp_ddp_caps;
+#endif
unsigned int min_mtu;
unsigned int max_mtu;
unsigned short type;
@@ -810,6 +810,8 @@ typedef unsigned char *sk_buff_data_t;
* delivery_time in mono clock base (i.e. EDT). Otherwise, the
* skb->tstamp has the (rcv) timestamp at ingress and
* delivery_time at egress.
+ * @no_condense: When set, don't condense this skb's fragments
+ * (their pages are in use by DDP offload and must not be copied or freed)
+ * @ulp_crc: CRC of the ULP payload was verified by the device (DDP offload)
* @napi_id: id of the NAPI struct this skb came from
* @sender_cpu: (aka @napi_id) source CPU in XPS
* @alloc_cpu: CPU which did the skb allocation.
@@ -989,7 +991,10 @@ struct sk_buff {
#if IS_ENABLED(CONFIG_IP_SCTP)
__u8 csum_not_inet:1;
#endif
-
+#ifdef CONFIG_ULP_DDP
+ __u8 no_condense:1;
+ __u8 ulp_crc:1;
+#endif
#if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
__u16 tc_index; /* traffic control index */
#endif
@@ -5063,5 +5068,23 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
ssize_t maxsize, gfp_t gfp);
+static inline bool skb_is_no_condense(struct sk_buff *skb)
+{
+#ifdef CONFIG_ULP_DDP
+ return skb->no_condense;
+#else
+ return false;
+#endif
+}
+
+static inline bool skb_is_ulp_crc(struct sk_buff *skb)
+{
+#ifdef CONFIG_ULP_DDP
+ return skb->ulp_crc;
+#else
+ return false;
+#endif
+}
+
#endif /* __KERNEL__ */
#endif /* _LINUX_SKBUFF_H */
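A minimal sketch of the intended use of these accessors (can_coalesce_ulp() is illustrative, not part of the patch): callers need no #ifdef, since both helpers compile to false when CONFIG_ULP_DDP is off.

static bool can_coalesce_ulp(struct sk_buff *a, struct sk_buff *b)
{
	/* skbs whose CRC was verified by the device must not be merged
	 * with unverified ones, or the ulp_crc flag of the result would
	 * be meaningless.
	 */
	return skb_is_ulp_crc(a) == skb_is_ulp_crc(b);
}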
@@ -68,6 +68,8 @@ struct inet_connection_sock_af_ops {
* @icsk_ulp_ops Pluggable ULP control hook
* @icsk_ulp_data ULP private data
* @icsk_clean_acked Clean acked data hook
+ * @icsk_ulp_ddp_ops Pluggable ULP direct data placement control hook
+ * @icsk_ulp_ddp_data ULP direct data placement private data
* @icsk_ca_state: Congestion control state
* @icsk_retransmits: Number of unrecovered [RTO] timeouts
* @icsk_pending: Scheduled timer event
@@ -98,6 +100,10 @@ struct inet_connection_sock {
const struct tcp_ulp_ops *icsk_ulp_ops;
void __rcu *icsk_ulp_data;
void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
+#ifdef CONFIG_ULP_DDP
+ const struct ulp_ddp_ulp_ops *icsk_ulp_ddp_ops;
+ void __rcu *icsk_ulp_ddp_data;
+#endif
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state:5,
icsk_ca_initialized:1,
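A sketch of how a ULP is expected to use these fields, assuming the ulp_ddp_get_ctx()/ulp_ddp_set_ctx() helpers added in net/ulp_ddp.h below; struct my_ddp_state is hypothetical:

struct my_ddp_state {
	u32 pending_resync_seq;
};

static struct my_ddp_state *my_ddp_state_get(const struct sock *sk)
{
	/* the protocol-specific state lives in ctx->buf */
	struct ulp_ddp_ctx *ctx = ulp_ddp_get_ctx(sk);

	return ctx ? (struct my_ddp_state *)ctx->buf : NULL;
}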
new file mode 100644
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * ulp_ddp.h
+ * Author: Boris Pismenny <borisp@nvidia.com>
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+#ifndef _ULP_DDP_H
+#define _ULP_DDP_H
+
+#include <linux/netdevice.h>
+#include <linux/scatterlist.h>
+#include <net/inet_connection_sock.h>
+#include <net/sock.h>
+
+#include <net/ulp_ddp_caps.h>
+
+enum ulp_ddp_type {
+ ULP_DDP_NVME = 1,
+};
+
+/**
+ * struct nvme_tcp_ddp_limits - nvme-tcp driver limitations
+ *
+ * @full_ccid_range: true if the driver supports the full CCID range
+ */
+struct nvme_tcp_ddp_limits {
+ bool full_ccid_range;
+};
+
+/**
+ * struct ulp_ddp_limits - Generic ULP DDP limits: TCP DDP
+ * protocol limits.
+ * Add new instances of ulp_ddp_limits in the union below (nvme-tcp, etc.).
+ *
+ * @type: type of this limits struct
+ * @max_ddp_sgl_len: maximum sgl size supported (zero means no limit)
+ * @io_threshold: minimum payload size required to offload
+ * @tls: support for ULP over TLS
+ * @nvmeotcp: NVMe-TCP specific limits
+ */
+struct ulp_ddp_limits {
+ enum ulp_ddp_type type;
+ int max_ddp_sgl_len;
+ int io_threshold;
+ bool tls:1;
+ union {
+ struct nvme_tcp_ddp_limits nvmeotcp;
+ };
+};
+
+/**
+ * struct nvme_tcp_ddp_config - nvme-tcp ddp configuration for an IO queue
+ *
+ * @pfv: pdu version (e.g., NVME_TCP_PFV_1_0)
+ * @cpda: controller pdu data alignment (dwords, 0's based)
+ * @dgst: digest types enabled (header or data, see
+ * enum nvme_tcp_digest_option).
+ * The netdev will offload CRC if it is supported.
+ * @queue_size: number of nvme-tcp IO queue elements
+ * @queue_id: queue identifier
+ */
+struct nvme_tcp_ddp_config {
+ u16 pfv;
+ u8 cpda;
+ u8 dgst;
+ int queue_size;
+ int queue_id;
+};
+
+/**
+ * struct ulp_ddp_config - Generic ulp ddp configuration
+ * Add new instances of ulp_ddp_config in the union below (nvme-tcp, etc.).
+ *
+ * @type: type of this config struct
+ * @io_cpu: cpu core running the IO thread for this socket
+ * @nvmeotcp: NVMe-TCP specific config
+ */
+struct ulp_ddp_config {
+ enum ulp_ddp_type type;
+ int io_cpu;
+ union {
+ struct nvme_tcp_ddp_config nvmeotcp;
+ };
+};
+
+/**
+ * struct ulp_ddp_io - ulp ddp configuration for an IO request.
+ *
+ * @command_id: identifier on the wire associated with these buffers
+ * @nents: number of entries in the sg_table
+ * @sg_table: scatter-gather table describing the buffers for this IO request
+ * @first_sgl: first SGL in sg_table
+ */
+struct ulp_ddp_io {
+ u32 command_id;
+ int nents;
+ struct sg_table sg_table;
+ struct scatterlist first_sgl[SG_CHUNK_SIZE];
+};
+
+/**
+ * struct netlink_ulp_ddp_stats - ULP DDP offload statistics
+ * @rx_nvmeotcp_sk_add: number of sockets successfully prepared for offloading.
+ * @rx_nvmeotcp_sk_add_fail: number of sockets that failed to be prepared
+ * for offloading.
+ * @rx_nvmeotcp_sk_del: number of sockets where offloading has been removed.
+ * @rx_nvmeotcp_ddp_setup: number of NVMe-TCP PDUs successfully prepared for
+ * Direct Data Placement.
+ * @rx_nvmeotcp_ddp_setup_fail: number of PDUs that failed DDP preparation.
+ * @rx_nvmeotcp_ddp_teardown: number of PDUs done with DDP.
+ * @rx_nvmeotcp_drop: number of PDUs dropped.
+ * @rx_nvmeotcp_resync: number of resync operations.
+ * @rx_nvmeotcp_packets: number of offloaded PDUs.
+ * @rx_nvmeotcp_bytes: number of offloaded bytes.
+ */
+struct netlink_ulp_ddp_stats {
+ u64 rx_nvmeotcp_sk_add;
+ u64 rx_nvmeotcp_sk_add_fail;
+ u64 rx_nvmeotcp_sk_del;
+ u64 rx_nvmeotcp_ddp_setup;
+ u64 rx_nvmeotcp_ddp_setup_fail;
+ u64 rx_nvmeotcp_ddp_teardown;
+ u64 rx_nvmeotcp_drop;
+ u64 rx_nvmeotcp_resync;
+ u64 rx_nvmeotcp_packets;
+ u64 rx_nvmeotcp_bytes;
+
+ /*
+ * add new stats at the end and keep in sync with
+ * Documentation/netlink/specs/ulp_ddp.yaml
+ */
+};
+
+struct netlink_ext_ack;
+
+/**
+ * struct ulp_ddp_dev_ops - operations used by an upper layer protocol
+ * to configure ddp offload
+ *
+ * @limits: query ulp driver limitations and quirks.
+ * @sk_add: add offload for the queue represented by the socket+config
+ * pair. This function is used to configure either copy, CRC,
+ * or both offloads.
+ * @sk_del: remove offload from the socket, and release any device
+ * related resources.
+ * @setup: request copy offload for buffers associated with a
+ * command_id in ulp_ddp_io.
+ * @teardown: release the association between the buffers and the
+ * command_id in ulp_ddp_io, freeing any device resources.
+ * @resync: respond to the driver's resync_request. Called only if
+ * resync is successful.
+ * @set_caps: set device ULP DDP capabilities.
+ * Returns a negative error code or zero.
+ * @get_stats: query ULP DDP statistics.
+ */
+struct ulp_ddp_dev_ops {
+ int (*limits)(struct net_device *netdev,
+ struct ulp_ddp_limits *limits);
+ int (*sk_add)(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_config *config);
+ void (*sk_del)(struct net_device *netdev,
+ struct sock *sk);
+ int (*setup)(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_io *io);
+ void (*teardown)(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_io *io,
+ void *ddp_ctx);
+ void (*resync)(struct net_device *netdev,
+ struct sock *sk, u32 seq);
+ int (*set_caps)(struct net_device *dev, unsigned long *bits,
+ struct netlink_ext_ack *extack);
+ int (*get_stats)(struct net_device *dev,
+ struct netlink_ulp_ddp_stats *stats);
+};
+
+#define ULP_DDP_RESYNC_PENDING BIT(0)
+
+/**
+ * struct ulp_ddp_ulp_ops - Interface to register upper layer
+ * Direct Data Placement (DDP) TCP offload.
+ * @resync_request: NIC requests ulp to indicate if @seq is the start
+ * of a message.
+ * @ddp_teardown_done: NIC driver informs the ulp that teardown is done,
+ * used for async completions.
+ */
+struct ulp_ddp_ulp_ops {
+ bool (*resync_request)(struct sock *sk, u32 seq, u32 flags);
+ void (*ddp_teardown_done)(void *ddp_ctx);
+};
+
+/**
+ * struct ulp_ddp_ctx - Generic ulp ddp context
+ *
+ * @type: type of this context struct
+ * @buf: protocol-specific context struct
+ */
+struct ulp_ddp_ctx {
+ enum ulp_ddp_type type;
+ unsigned char buf[];
+};
+
+static inline struct ulp_ddp_ctx *ulp_ddp_get_ctx(const struct sock *sk)
+{
+#ifdef CONFIG_ULP_DDP
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return (__force struct ulp_ddp_ctx *)icsk->icsk_ulp_ddp_data;
+#else
+ return NULL;
+#endif
+}
+
+static inline void ulp_ddp_set_ctx(struct sock *sk, void *ctx)
+{
+#ifdef CONFIG_ULP_DDP
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ rcu_assign_pointer(icsk->icsk_ulp_ddp_data, ctx);
+#endif
+}
+
+static inline int ulp_ddp_setup(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_io *io)
+{
+#ifdef CONFIG_ULP_DDP
+ return netdev->netdev_ops->ulp_ddp_ops->setup(netdev, sk, io);
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static inline void ulp_ddp_teardown(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_io *io,
+ void *ddp_ctx)
+{
+#ifdef CONFIG_ULP_DDP
+ netdev->netdev_ops->ulp_ddp_ops->teardown(netdev, sk, io, ddp_ctx);
+#endif
+}
+
+static inline void ulp_ddp_resync(struct net_device *netdev,
+ struct sock *sk,
+ u32 seq)
+{
+#ifdef CONFIG_ULP_DDP
+ netdev->netdev_ops->ulp_ddp_ops->resync(netdev, sk, seq);
+#endif
+}
+
+#ifdef CONFIG_ULP_DDP
+
+int ulp_ddp_sk_add(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_config *config,
+ const struct ulp_ddp_ulp_ops *ops);
+
+void ulp_ddp_sk_del(struct net_device *netdev,
+ struct sock *sk);
+
+bool ulp_ddp_query_limits(struct net_device *netdev,
+ struct ulp_ddp_limits *limits,
+ enum ulp_ddp_type type,
+ int cap_bit_nr,
+ bool tls);
+
+#else
+
+static inline int ulp_ddp_sk_add(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_config *config,
+ const struct ulp_ddp_ulp_ops *ops)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void ulp_ddp_sk_del(struct net_device *netdev,
+ struct sock *sk)
+{}
+
+static inline bool ulp_ddp_query_limits(struct net_device *netdev,
+ struct ulp_ddp_limits *limits,
+ enum ulp_ddp_type type,
+ int cap_bit_nr,
+ bool tls)
+{
+ return false;
+}
+
+#endif /* CONFIG_ULP_DDP */
+
+#endif /* _ULP_DDP_H */
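To make the per-IO contract concrete, a hedged sketch of the fast path as a ULP might implement it (my_queue/my_req and their members are hypothetical): DDP is armed before the read request goes out, and torn down once the response PDU has been fully placed.

static int my_setup_ddp(struct my_queue *q, struct my_req *req)
{
	req->ddp.command_id = req->command_id;
	req->ddp.sg_table.sgl = req->ddp.first_sgl;
	/* ... fill req->ddp.sg_table with the destination buffers ... */

	return ulp_ddp_setup(q->netdev, q->sk, &req->ddp);
}

static void my_complete_ddp(struct my_queue *q, struct my_req *req)
{
	/* completion is async: the driver signals the end of teardown
	 * through ->ddp_teardown_done(ddp_ctx), here with req as cookie.
	 */
	ulp_ddp_teardown(q->netdev, q->sk, &req->ddp, req);
}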
new file mode 100644
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * ulp_ddp_caps.h
+ * Author: Aurelien Aptel <aaptel@nvidia.com>
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+#ifndef _ULP_DDP_CAPS_H
+#define _ULP_DDP_CAPS_H
+
+#include <linux/types.h>
+
+enum {
+ ULP_DDP_C_NVME_TCP_BIT,
+ ULP_DDP_C_NVME_TCP_DDGST_RX_BIT,
+
+ /*
+ * add capabilities above and keep in sync with
+ * Documentation/netlink/specs/ulp_ddp.yaml
+ */
+ ULP_DDP_C_COUNT,
+};
+
+struct ulp_ddp_netdev_caps {
+ DECLARE_BITMAP(active, ULP_DDP_C_COUNT);
+ DECLARE_BITMAP(hw, ULP_DDP_C_COUNT);
+};
+
+static inline bool ulp_ddp_cap_turned_on(unsigned long *old,
+ unsigned long *new,
+ int bit_nr)
+{
+ return !test_bit(bit_nr, old) && test_bit(bit_nr, new);
+}
+
+static inline bool ulp_ddp_cap_turned_off(unsigned long *old,
+ unsigned long *new,
+ int bit_nr)
+{
+ return test_bit(bit_nr, old) && !test_bit(bit_nr, new);
+}
+
+#endif /* _ULP_DDP_CAPS_H */
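A sketch of how a driver might consume these helpers in its ->set_caps() callback (my_drv_set_caps() and the resource management are illustrative):

static int my_drv_set_caps(struct net_device *dev, unsigned long *bits,
			   struct netlink_ext_ack *extack)
{
	if (ulp_ddp_cap_turned_on(dev->ulp_ddp_caps.active, bits,
				  ULP_DDP_C_NVME_TCP_BIT)) {
		/* allocate HW resources before advertising the cap */
	}
	if (ulp_ddp_cap_turned_off(dev->ulp_ddp_caps.active, bits,
				   ULP_DDP_C_NVME_TCP_BIT)) {
		/* quiesce offloaded sockets, then free HW resources */
	}
	bitmap_copy(dev->ulp_ddp_caps.active, bits, ULP_DDP_C_COUNT);
	return 0;
}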
@@ -517,4 +517,24 @@ config NET_TEST
If unsure, say N.
+config ULP_DDP
+ bool "ULP direct data placement offload"
+ help
+ This feature provides a generic infrastructure for Direct
+ Data Placement (DDP) offload for Upper Layer Protocols (ULP,
+ such as NVMe-TCP).
+
+ If the ULP and the NIC driver support it, the ULP code can
+ request the NIC to place ULP response data directly
+ into application memory, avoiding a costly copy.
+
+ This infrastructure also allows offloading the ULP data
+ integrity checks (e.g. data digest) that would otherwise
+ require another costly pass over the data we avoided
+ copying.
+
+ For more information, see
+ <file:Documentation/networking/ulp-ddp-offload.rst>.
+
endif # if NET
@@ -18,6 +18,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
+obj-$(CONFIG_ULP_DDP) += ulp_ddp.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
@@ -75,6 +75,7 @@
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
#include <net/dropreason.h>
+#include <net/ulp_ddp.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -6605,7 +6606,7 @@ void skb_condense(struct sk_buff *skb)
{
if (skb->data_len) {
if (skb->data_len > skb->end - skb->tail ||
- skb_cloned(skb))
+ skb_cloned(skb) || skb_is_no_condense(skb))
return;
/* Nice, we can free page frag(s) right now */
new file mode 100644
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ulp_ddp.c
+ * Author: Aurelien Aptel <aaptel@nvidia.com>
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#include <net/ulp_ddp.h>
+
+int ulp_ddp_sk_add(struct net_device *netdev,
+ struct sock *sk,
+ struct ulp_ddp_config *config,
+ const struct ulp_ddp_ulp_ops *ops)
+{
+ int ret;
+
+ /* the reference is dropped by dev_put() in ulp_ddp_sk_del() */
+ dev_hold(netdev);
+
+ config->io_cpu = sk->sk_incoming_cpu;
+ ret = netdev->netdev_ops->ulp_ddp_ops->sk_add(netdev, sk, config);
+ if (ret) {
+ dev_put(netdev);
+ return ret;
+ }
+
+ inet_csk(sk)->icsk_ulp_ddp_ops = ops;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ulp_ddp_sk_add);
+
+void ulp_ddp_sk_del(struct net_device *netdev,
+ struct sock *sk)
+{
+ netdev->netdev_ops->ulp_ddp_ops->sk_del(netdev, sk);
+ inet_csk(sk)->icsk_ulp_ddp_ops = NULL;
+ dev_put(netdev);
+}
+EXPORT_SYMBOL_GPL(ulp_ddp_sk_del);
+
+bool ulp_ddp_query_limits(struct net_device *netdev,
+ struct ulp_ddp_limits *limits,
+ enum ulp_ddp_type type,
+ int cap_bit_nr,
+ bool tls)
+{
+ int ret;
+
+ if (!netdev->netdev_ops->ulp_ddp_ops ||
+ !netdev->netdev_ops->ulp_ddp_ops->limits)
+ return false;
+
+ limits->type = type;
+ ret = netdev->netdev_ops->ulp_ddp_ops->limits(netdev, limits);
+ if (ret == -EOPNOTSUPP)
+ return false;
+ if (ret) {
+ WARN_ONCE(ret, "ddp limits failed (ret=%d)", ret);
+ return false;
+ }
+ if (!test_bit(cap_bit_nr, netdev->ulp_ddp_caps.active) ||
+ (tls && !limits->tls))
+ return false;
+
+ dev_dbg_ratelimited(&netdev->dev,
+ "netdev %s offload limits: max_ddp_sgl_len %d\n",
+ netdev->name, limits->max_ddp_sgl_len);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(ulp_ddp_query_limits);
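Putting the socket-level API together, a hedged bring-up/teardown sketch from the ULP side (my_queue and the callback names are hypothetical); note that ulp_ddp_sk_add() takes a netdev reference that ulp_ddp_sk_del() releases:

static const struct ulp_ddp_ulp_ops my_ulp_ddp_ulp_ops = {
	.resync_request		= my_resync_request,	/* hypothetical */
	.ddp_teardown_done	= my_teardown_done,	/* hypothetical */
};

static int my_queue_offload_start(struct my_queue *q)
{
	struct ulp_ddp_limits limits;
	struct ulp_ddp_config cfg = { .type = ULP_DDP_NVME };

	if (!ulp_ddp_query_limits(q->netdev, &limits, ULP_DDP_NVME,
				  ULP_DDP_C_NVME_TCP_BIT, false))
		return -EOPNOTSUPP;

	return ulp_ddp_sk_add(q->netdev, q->sk, &cfg, &my_ulp_ddp_ulp_ops);
}

static void my_queue_offload_stop(struct my_queue *q)
{
	ulp_ddp_sk_del(q->netdev, q->sk);
}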
@@ -4740,7 +4740,10 @@ static bool tcp_try_coalesce(struct sock *sk,
if (from->decrypted != to->decrypted)
return false;
#endif
-
+#ifdef CONFIG_ULP_DDP
+ if (skb_is_ulp_crc(from) != skb_is_ulp_crc(to))
+ return false;
+#endif
if (!skb_try_coalesce(to, from, fragstolen, &delta))
return false;
@@ -5310,6 +5313,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
#ifdef CONFIG_TLS_DEVICE
nskb->decrypted = skb->decrypted;
+#endif
+#ifdef CONFIG_ULP_DDP
+ nskb->no_condense = skb->no_condense;
+ nskb->ulp_crc = skb->ulp_crc;
#endif
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
if (list)
@@ -5343,6 +5350,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
#ifdef CONFIG_TLS_DEVICE
if (skb->decrypted != nskb->decrypted)
goto end;
+#endif
+#ifdef CONFIG_ULP_DDP
+ if (skb_is_ulp_crc(skb) != skb_is_ulp_crc(nskb))
+ goto end;
#endif
}
}
@@ -1872,6 +1872,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
tail->decrypted != skb->decrypted ||
#endif
!mptcp_skb_can_collapse(tail, skb) ||
+#ifdef CONFIG_ULP_DDP
+ skb_is_ulp_crc(tail) != skb_is_ulp_crc(skb) ||
+#endif
thtail->doff != th->doff ||
memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
goto no_coalesce;
@@ -268,6 +268,9 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
#ifdef CONFIG_TLS_DEVICE
flush |= p->decrypted ^ skb->decrypted;
#endif
+#ifdef CONFIG_ULP_DDP
+ flush |= skb_is_ulp_crc(p) ^ skb_is_ulp_crc(skb);
+#endif
if (flush || skb_gro_receive(p, skb)) {
mss = 1;