diff mbox series

[v5,net-next,01/36] net: Introduce direct data placement tcp offload

Message ID 20210722110325.371-2-borisp@nvidia.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series nvme-tcp receive and tarnsmit offloads | expand

Checks

Context Check Description
netdev/apply fail Patch does not apply to net-next
netdev/tree_selection success Clearly marked for net-next

Commit Message

Boris Pismenny July 22, 2021, 11:02 a.m. UTC
From: Boris Pismenny <borisp@mellanox.com>

This commit introduces direct data placement offload for TCP.
This capability is accompanied by new net_device operations that
configure hardware contexts. There is a context per socket, and a context per DDP
opreation. Additionally, a resynchronization routine is used to assist
hardware handle TCP OOO, and continue the offload.
Furthermore, we let the offloading driver advertise what is the max hw
sectors/segments.

Using this interface, the NIC hardware will scatter TCP payload directly
to the BIO pages according to the command_id.
To maintain the correctness of the network stack, the driver is expected
to construct SKBs that point to the BIO pages.

The SKB passed to the network stack from the driver
represents data as it is on the wire, while it is pointing
directly to data in destination buffers.
As a result, data from page frags should not be copied out to
the linear part. To avoid needless copies, such as when using
skb_condense, we mark the skb->ddp_crc bit. This bit will be
used to indicate both ddp and crc offload (next patch in series).

A follow-up patch will use this interface for DDP in NVMe-TCP.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Ben Ben-Ishay <benishay@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Yoray Zack <yorayz@mellanox.com>
---
 include/linux/netdev_features.h    |   3 +-
 include/linux/netdevice.h          |   5 ++
 include/linux/skbuff.h             |   4 +
 include/net/inet_connection_sock.h |   4 +
 include/net/ulp_ddp.h              | 136 +++++++++++++++++++++++++++++
 net/Kconfig                        |  10 +++
 net/core/skbuff.c                  |   8 +-
 net/ethtool/common.c               |   1 +
 net/ipv4/tcp_input.c               |   8 ++
 net/ipv4/tcp_ipv4.c                |   3 +
 net/ipv4/tcp_offload.c             |   3 +
 11 files changed, 183 insertions(+), 2 deletions(-)
 create mode 100644 include/net/ulp_ddp.h

Comments

Eric Dumazet July 22, 2021, 11:26 a.m. UTC | #1
On Thu, Jul 22, 2021 at 1:04 PM Boris Pismenny <borisp@nvidia.com> wrote:
>
> From: Boris Pismenny <borisp@mellanox.com>
>
>
...

>  };
>
>  const char
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index e6ca5a1f3b59..4a7160bba09b 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -5149,6 +5149,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
>                 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
>  #ifdef CONFIG_TLS_DEVICE
>                 nskb->decrypted = skb->decrypted;
> +#endif
> +#ifdef CONFIG_ULP_DDP
> +               nskb->ddp_crc = skb->ddp_crc;

Probably you do not want to attempt any collapse if skb->ddp_crc is
set right there.

>  #endif
>                 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
>                 if (list)
> @@ -5182,6 +5185,11 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
>  #ifdef CONFIG_TLS_DEVICE
>                                 if (skb->decrypted != nskb->decrypted)
>                                         goto end;
> +#endif
> +#ifdef CONFIG_ULP_DDP
> +
> +                               if (skb->ddp_crc != nskb->ddp_crc)

This checks only the second, third, and remaining skbs.

> +                                       goto end;
>  #endif
>                         }
>                 }


tcp_collapse() is copying data from small skbs to pack it to bigger
skb (one page of payload), in case
of memory emergency/pressure (socket queues are full)

If your changes are trying to avoid 'needless'  copies, maybe you
should reconsider and let the emergency packing be done.

If the copy is not _possible_, you should rephrase your changelog to
clearly state the kernel _cannot_ access this memory in any way.
Boris Pismenny July 22, 2021, 12:18 p.m. UTC | #2
On 22/07/2021 14:26, Eric Dumazet wrote:
> On Thu, Jul 22, 2021 at 1:04 PM Boris Pismenny <borisp@nvidia.com> wrote:
>>
>> From: Boris Pismenny <borisp@mellanox.com>
>>
>>
> ...
> 
>>  };
>>
>>  const char
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index e6ca5a1f3b59..4a7160bba09b 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -5149,6 +5149,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
>>                 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
>>  #ifdef CONFIG_TLS_DEVICE
>>                 nskb->decrypted = skb->decrypted;
>> +#endif
>> +#ifdef CONFIG_ULP_DDP
>> +               nskb->ddp_crc = skb->ddp_crc;
> 
> Probably you do not want to attempt any collapse if skb->ddp_crc is
> set right there.
> 

Right.

>>  #endif
>>                 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
>>                 if (list)
>> @@ -5182,6 +5185,11 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
>>  #ifdef CONFIG_TLS_DEVICE
>>                                 if (skb->decrypted != nskb->decrypted)
>>                                         goto end;
>> +#endif
>> +#ifdef CONFIG_ULP_DDP
>> +
>> +                               if (skb->ddp_crc != nskb->ddp_crc)
> 
> This checks only the second, third, and remaining skbs.
> 

Right, as we handle the head skb above. Could you clarify?

>> +                                       goto end;
>>  #endif
>>                         }
>>                 }
> 
> 
> tcp_collapse() is copying data from small skbs to pack it to bigger
> skb (one page of payload), in case
> of memory emergency/pressure (socket queues are full)
> 
> If your changes are trying to avoid 'needless'  copies, maybe you
> should reconsider and let the emergency packing be done.
> 
> If the copy is not _possible_, you should rephrase your changelog to
> clearly state the kernel _cannot_ access this memory in any way.
> 

The issue is that skb_condense also gets called on many skbs in
tcp_add_backlog and it will identify skbs that went through DDP as ideal
for packing, even though they are not small and packing is
counter-productive as data already resides in its destination.

As mentioned above, it is possible to copy, but it is counter-productive
in this case. If there was a real need to access this memory, then it is
allowed.
Eric Dumazet July 22, 2021, 1:10 p.m. UTC | #3
On Thu, Jul 22, 2021 at 2:18 PM Boris Pismenny <borispismenny@gmail.com> wrote:
>
> On 22/07/2021 14:26, Eric Dumazet wrote:
> > On Thu, Jul 22, 2021 at 1:04 PM Boris Pismenny <borisp@nvidia.com> wrote:
> >>
> >> From: Boris Pismenny <borisp@mellanox.com>
> >>
> >>
> > ...
> >
> >>  };
> >>
> >>  const char
> >> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> >> index e6ca5a1f3b59..4a7160bba09b 100644
> >> --- a/net/ipv4/tcp_input.c
> >> +++ b/net/ipv4/tcp_input.c
> >> @@ -5149,6 +5149,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
> >>                 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
> >>  #ifdef CONFIG_TLS_DEVICE
> >>                 nskb->decrypted = skb->decrypted;
> >> +#endif
> >> +#ifdef CONFIG_ULP_DDP
> >> +               nskb->ddp_crc = skb->ddp_crc;
> >
> > Probably you do not want to attempt any collapse if skb->ddp_crc is
> > set right there.
> >
>
> Right.
>
> >>  #endif
> >>                 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
> >>                 if (list)
> >> @@ -5182,6 +5185,11 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
> >>  #ifdef CONFIG_TLS_DEVICE
> >>                                 if (skb->decrypted != nskb->decrypted)
> >>                                         goto end;
> >> +#endif
> >> +#ifdef CONFIG_ULP_DDP
> >> +
> >> +                               if (skb->ddp_crc != nskb->ddp_crc)
> >
> > This checks only the second, third, and remaining skbs.
> >
>
> Right, as we handle the head skb above. Could you clarify?

I was simply saying you missed the first skb.

>
> >> +                                       goto end;
> >>  #endif
> >>                         }
> >>                 }
> >
> >
> > tcp_collapse() is copying data from small skbs to pack it to bigger
> > skb (one page of payload), in case
> > of memory emergency/pressure (socket queues are full)
> >
> > If your changes are trying to avoid 'needless'  copies, maybe you
> > should reconsider and let the emergency packing be done.
> >
> > If the copy is not _possible_, you should rephrase your changelog to
> > clearly state the kernel _cannot_ access this memory in any way.
> >
>
> The issue is that skb_condense also gets called on many skbs in
> tcp_add_backlog and it will identify skbs that went through DDP as ideal
> for packing, even though they are not small and packing is
> counter-productive as data already resides in its destination.
>
> As mentioned above, it is possible to copy, but it is counter-productive
> in this case. If there was a real need to access this memory, then it is
> allowed.

Standard GRO packets from high perf drivers have no room in their
skb->head (ie skb_tailroom() should be 0)

If you have a driver using GRO and who pulled some payload in
skb->head, it is already too late for DDP.

So I think you are trying to add code in TCP that should not be
needed. Perhaps mlx5 driver is doing something it should not ?
(If this is ' copybreak'  this has been documented as being
suboptimal, transports have better strategies)

Secondly, tcp_collapse() should absolutely not be called under regular
workloads.

Trying to optimize this last-resort thing is a lost cause:
If an application is dumb enough to send small packets back-to-back,
it should be fixed (sender side has this thing called autocork, for
applications that do not know about MSG_MORE or TC_CORK.)

(tcp_collapse is a severe source of latencies)
Boris Pismenny July 22, 2021, 1:33 p.m. UTC | #4
On 22/07/2021 16:10, Eric Dumazet wrote:
> On Thu, Jul 22, 2021 at 2:18 PM Boris Pismenny <borispismenny@gmail.com> wrote:
>>
>> On 22/07/2021 14:26, Eric Dumazet wrote:
>>> On Thu, Jul 22, 2021 at 1:04 PM Boris Pismenny <borisp@nvidia.com> wrote:
>>>>
>>>> From: Boris Pismenny <borisp@mellanox.com>
>>>>
>>>>
>>> ...
>>>
>>>>  };
>>>>
>>>>  const char
>>>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>>>> index e6ca5a1f3b59..4a7160bba09b 100644
>>>> --- a/net/ipv4/tcp_input.c
>>>> +++ b/net/ipv4/tcp_input.c
>>>> @@ -5149,6 +5149,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
>>>>                 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
>>>>  #ifdef CONFIG_TLS_DEVICE
>>>>                 nskb->decrypted = skb->decrypted;
>>>> +#endif
>>>> +#ifdef CONFIG_ULP_DDP
>>>> +               nskb->ddp_crc = skb->ddp_crc;
>>>
>>> Probably you do not want to attempt any collapse if skb->ddp_crc is
>>> set right there.
>>>
>>
>> Right.
>>
>>>>  #endif
>>>>                 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
>>>>                 if (list)
>>>> @@ -5182,6 +5185,11 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
>>>>  #ifdef CONFIG_TLS_DEVICE
>>>>                                 if (skb->decrypted != nskb->decrypted)
>>>>                                         goto end;
>>>> +#endif
>>>> +#ifdef CONFIG_ULP_DDP
>>>> +
>>>> +                               if (skb->ddp_crc != nskb->ddp_crc)
>>>
>>> This checks only the second, third, and remaining skbs.
>>>
>>
>> Right, as we handle the head skb above. Could you clarify?
> 
> I was simply saying you missed the first skb.
> 

But, the first SKB got handled in the change above. The code here is the
same for TLS, if it is wrong, then we already have an issue here.

>>
>>>> +                                       goto end;
>>>>  #endif
>>>>                         }
>>>>                 }
>>>
>>>
>>> tcp_collapse() is copying data from small skbs to pack it to bigger
>>> skb (one page of payload), in case
>>> of memory emergency/pressure (socket queues are full)
>>>
>>> If your changes are trying to avoid 'needless'  copies, maybe you
>>> should reconsider and let the emergency packing be done.
>>>
>>> If the copy is not _possible_, you should rephrase your changelog to
>>> clearly state the kernel _cannot_ access this memory in any way.
>>>
>>
>> The issue is that skb_condense also gets called on many skbs in
>> tcp_add_backlog and it will identify skbs that went through DDP as ideal
>> for packing, even though they are not small and packing is
>> counter-productive as data already resides in its destination.
>>
>> As mentioned above, it is possible to copy, but it is counter-productive
>> in this case. If there was a real need to access this memory, then it is
>> allowed.
> 
> Standard GRO packets from high perf drivers have no room in their
> skb->head (ie skb_tailroom() should be 0)
> 
> If you have a driver using GRO and who pulled some payload in
> skb->head, it is already too late for DDP.
> 
> So I think you are trying to add code in TCP that should not be
> needed. Perhaps mlx5 driver is doing something it should not ?
> (If this is ' copybreak'  this has been documented as being
> suboptimal, transports have better strategies)
> 
> Secondly, tcp_collapse() should absolutely not be called under regular
> workloads.
> 
> Trying to optimize this last-resort thing is a lost cause:
> If an application is dumb enough to send small packets back-to-back,
> it should be fixed (sender side has this thing called autocork, for
> applications that do not know about MSG_MORE or TC_CORK.)
> 
> (tcp_collapse is a severe source of latencies)
> 


Sorry. My response above was about skb_condense which I've confused with
tcp_collapse.

In tcp_collapse, we could allow the copy, but the problem is CRC, which
like TLS's skb->decrypted marks that the data passed the digest
validation in the NIC. If we allow collapsing SKBs with mixed marks, we
will need to force software copy+crc verification. As TCP collapse is
indeed rare and the offload is opportunistic in nature, we can make this
change and submit another version, but I'm confused; why was it OK for
TLS, while it is not OK for DDP+CRC?
Eric Dumazet July 22, 2021, 1:39 p.m. UTC | #5
On Thu, Jul 22, 2021 at 3:33 PM Boris Pismenny <borispismenny@gmail.com> wrote:

> Sorry. My response above was about skb_condense which I've confused with
> tcp_collapse.
>
> In tcp_collapse, we could allow the copy, but the problem is CRC, which
> like TLS's skb->decrypted marks that the data passed the digest
> validation in the NIC. If we allow collapsing SKBs with mixed marks, we
> will need to force software copy+crc verification. As TCP collapse is
> indeed rare and the offload is opportunistic in nature, we can make this
> change and submit another version, but I'm confused; why was it OK for
> TLS, while it is not OK for DDP+CRC?
>

Ah.... I guess I was focused on the DDP part, while all your changes
are really about the CRC part.

Perhaps having an accessor to express the CRC status (and not be
confused by the DDP part)
 could help the intent of the code.
Boris Pismenny July 22, 2021, 2:02 p.m. UTC | #6
On 22/07/2021 16:39, Eric Dumazet wrote:
> On Thu, Jul 22, 2021 at 3:33 PM Boris Pismenny <borispismenny@gmail.com> wrote:
> 
>> Sorry. My response above was about skb_condense which I've confused with
>> tcp_collapse.
>>
>> In tcp_collapse, we could allow the copy, but the problem is CRC, which
>> like TLS's skb->decrypted marks that the data passed the digest
>> validation in the NIC. If we allow collapsing SKBs with mixed marks, we
>> will need to force software copy+crc verification. As TCP collapse is
>> indeed rare and the offload is opportunistic in nature, we can make this
>> change and submit another version, but I'm confused; why was it OK for
>> TLS, while it is not OK for DDP+CRC?
>>
> 
> Ah.... I guess I was focused on the DDP part, while all your changes
> are really about the CRC part.
> 
> Perhaps having an accessor to express the CRC status (and not be
> confused by the DDP part)
>  could help the intent of the code.
> 

An accessor function sounds like a great idea for readability, thanks Eric!

We will re-spin the series and add it to v6.
diff mbox series

Patch

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 2c6b9e416225..d9bd6ea26fc8 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -14,7 +14,7 @@  typedef u64 netdev_features_t;
 enum {
 	NETIF_F_SG_BIT,			/* Scatter/gather IO. */
 	NETIF_F_IP_CSUM_BIT,		/* Can checksum TCP/UDP over IPv4. */
-	__UNUSED_NETIF_F_1,
+	NETIF_F_HW_ULP_DDP_BIT,         /* ULP direct data placement offload */
 	NETIF_F_HW_CSUM_BIT,		/* Can checksum all the packets. */
 	NETIF_F_IPV6_CSUM_BIT,		/* Can checksum TCP/UDP over IPV6 */
 	NETIF_F_HIGHDMA_BIT,		/* Can DMA to high memory. */
@@ -168,6 +168,7 @@  enum {
 #define NETIF_F_HW_HSR_TAG_RM	__NETIF_F(HW_HSR_TAG_RM)
 #define NETIF_F_HW_HSR_FWD	__NETIF_F(HW_HSR_FWD)
 #define NETIF_F_HW_HSR_DUP	__NETIF_F(HW_HSR_DUP)
+#define NETIF_F_HW_ULP_DDP	__NETIF_F(HW_ULP_DDP)
 
 /* Finds the next feature with the highest number of the range of start till 0.
  */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eaf5bb008aa9..cba92c2dd9c0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1005,6 +1005,7 @@  struct dev_ifalias {
 
 struct devlink;
 struct tlsdev_ops;
+struct ulp_ddp_dev_ops;
 
 struct netdev_name_node {
 	struct hlist_node hlist;
@@ -2024,6 +2025,10 @@  struct net_device {
 	const struct tlsdev_ops *tlsdev_ops;
 #endif
 
+#if IS_ENABLED(CONFIG_ULP_DDP)
+	const struct ulp_ddp_dev_ops *ulp_ddp_ops;
+#endif
+
 	const struct header_ops *header_ops;
 
 	unsigned char		operstate;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b2db9cd9a73f..d323ecd37448 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -689,6 +689,7 @@  typedef unsigned char *sk_buff_data_t;
  *		CHECKSUM_UNNECESSARY (max 3)
  *	@dst_pending_confirm: need to confirm neighbour
  *	@decrypted: Decrypted SKB
+ *	@ddp_crc: DDP or CRC offloaded
  *	@napi_id: id of the NAPI struct this skb came from
  *	@sender_cpu: (aka @napi_id) source CPU in XPS
  *	@secmark: security marking
@@ -870,6 +871,9 @@  struct sk_buff {
 #ifdef CONFIG_TLS_DEVICE
 	__u8			decrypted:1;
 #endif
+#ifdef CONFIG_ULP_DDP
+	__u8                    ddp_crc:1;
+#endif
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b06c2d02ec84..66801ea72fb4 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -66,6 +66,8 @@  struct inet_connection_sock_af_ops {
  * @icsk_ulp_ops	   Pluggable ULP control hook
  * @icsk_ulp_data	   ULP private data
  * @icsk_clean_acked	   Clean acked data hook
+ * @icsk_ulp_ddp_ops	   Pluggable ULP direct data placement control hook
+ * @icsk_ulp_ddp_data	   ULP direct data placement private data
  * @icsk_listen_portaddr_node	hash to the portaddr listener hashtable
  * @icsk_ca_state:	   Congestion control state
  * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
@@ -96,6 +98,8 @@  struct inet_connection_sock {
 	const struct tcp_ulp_ops  *icsk_ulp_ops;
 	void __rcu		  *icsk_ulp_data;
 	void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
+	const struct ulp_ddp_ulp_ops  *icsk_ulp_ddp_ops;
+	void __rcu		  *icsk_ulp_ddp_data;
 	struct hlist_node         icsk_listen_portaddr_node;
 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
 	__u8			  icsk_ca_state:5,
diff --git a/include/net/ulp_ddp.h b/include/net/ulp_ddp.h
new file mode 100644
index 000000000000..1a0b464ff40b
--- /dev/null
+++ b/include/net/ulp_ddp.h
@@ -0,0 +1,136 @@ 
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * ulp_ddp.h
+ *	Author:	Boris Pismenny <borisp@mellanox.com>
+ *	Copyright (C) 2021 Mellanox Technologies.
+ */
+#ifndef _ULP_DDP_H
+#define _ULP_DDP_H
+
+#include <linux/netdevice.h>
+#include <net/inet_connection_sock.h>
+#include <net/sock.h>
+
+/* limits returned by the offload driver, zero means don't care */
+struct ulp_ddp_limits {
+	int	 max_ddp_sgl_len;
+};
+
+enum ulp_ddp_type {
+	ULP_DDP_NVME = 1,
+};
+
+/**
+ * struct ulp_ddp_config - Generic ulp ddp configuration: tcp ddp IO queue
+ * config implementations must use this as the first member.
+ * Add new instances of ulp_ddp_config below (nvme-tcp, etc.).
+ */
+struct ulp_ddp_config {
+	enum ulp_ddp_type    type;
+	unsigned char        buf[];
+};
+
+/**
+ * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue
+ *
+ * @pfv:        pdu version (e.g., NVME_TCP_PFV_1_0)
+ * @cpda:       controller pdu data alignmend (dwords, 0's based)
+ * @dgst:       digest types enabled.
+ *              The netdev will offload crc if ddp_crc is supported.
+ * @queue_size: number of nvme-tcp IO queue elements
+ * @queue_id:   queue identifier
+ * @cpu_io:     cpu core running the IO thread for this queue
+ */
+struct nvme_tcp_ddp_config {
+	struct ulp_ddp_config   cfg;
+
+	u16			pfv;
+	u8			cpda;
+	u8			dgst;
+	int			queue_size;
+	int			queue_id;
+	int			io_cpu;
+};
+
+/**
+ * struct ulp_ddp_io - ulp ddp configuration for an IO request.
+ *
+ * @command_id:  identifier on the wire associated with these buffers
+ * @nents:       number of entries in the sg_table
+ * @sg_table:    describing the buffers for this IO request
+ * @first_sgl:   first SGL in sg_table
+ */
+struct ulp_ddp_io {
+	u32			command_id;
+	int			nents;
+	struct sg_table		sg_table;
+	struct scatterlist	first_sgl[SG_CHUNK_SIZE];
+};
+
+/* struct ulp_ddp_dev_ops - operations used by an upper layer protocol to configure ddp offload
+ *
+ * @ulp_ddp_limits:    limit the number of scatter gather entries per IO.
+ *                     the device driver can use this to limit the resources allocated per queue.
+ * @ulp_ddp_sk_add:    add offload for the queue represennted by the socket+config pair.
+ *                     this function is used to configure either copy, crc or both offloads.
+ * @ulp_ddp_sk_del:    remove offload from the socket, and release any device related resources.
+ * @ulp_ddp_setup:     request copy offload for buffers associated with a command_id in ulp_ddp_io.
+ * @ulp_ddp_teardown:  release offload resources association between buffers and command_id in
+ *                     ulp_ddp_io.
+ * @ulp_ddp_resync:    respond to the driver's resync_request. Called only if resync is successful.
+ */
+struct ulp_ddp_dev_ops {
+	int (*ulp_ddp_limits)(struct net_device *netdev,
+			      struct ulp_ddp_limits *limits);
+	int (*ulp_ddp_sk_add)(struct net_device *netdev,
+			      struct sock *sk,
+			      struct ulp_ddp_config *config);
+	void (*ulp_ddp_sk_del)(struct net_device *netdev,
+			       struct sock *sk);
+	int (*ulp_ddp_setup)(struct net_device *netdev,
+			     struct sock *sk,
+			     struct ulp_ddp_io *io);
+	int (*ulp_ddp_teardown)(struct net_device *netdev,
+				struct sock *sk,
+				struct ulp_ddp_io *io,
+				void *ddp_ctx);
+	void (*ulp_ddp_resync)(struct net_device *netdev,
+			       struct sock *sk, u32 seq);
+};
+
+#define ULP_DDP_RESYNC_REQ BIT(0)
+
+/**
+ * struct ulp_ddp_ulp_ops - Interface to register uppper layer Direct Data Placement (DDP) TCP offload
+ */
+struct ulp_ddp_ulp_ops {
+	/* NIC requests ulp to indicate if @seq is the start of a message */
+	bool (*resync_request)(struct sock *sk, u32 seq, u32 flags);
+	/* NIC driver informs the ulp that ddp teardown is done - used for async completions*/
+	void (*ddp_teardown_done)(void *ddp_ctx);
+};
+
+/**
+ * struct ulp_ddp_ctx - Generic ulp ddp context: device driver per queue contexts must
+ * use this as the first member.
+ */
+struct ulp_ddp_ctx {
+	enum ulp_ddp_type    type;
+	unsigned char        buf[];
+};
+
+static inline struct ulp_ddp_ctx *ulp_ddp_get_ctx(const struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return (__force struct ulp_ddp_ctx *)icsk->icsk_ulp_ddp_data;
+}
+
+static inline void ulp_ddp_set_ctx(struct sock *sk, void *ctx)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	rcu_assign_pointer(icsk->icsk_ulp_ddp_data, ctx);
+}
+
+#endif //_ULP_DDP_H
diff --git a/net/Kconfig b/net/Kconfig
index c7392c449b25..b6f0ccbea1e3 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -454,4 +454,14 @@  config ETHTOOL_NETLINK
 	  netlink. It provides better extensibility and some new features,
 	  e.g. notification messages.
 
+config ULP_DDP
+	bool "ULP direct data placement offload"
+	default n
+	help
+	  Direct Data Placement (DDP) offload enables ULP, such as
+	  NVMe-TCP/iSCSI, to request the NIC to place ULP payload data
+	  of a command response directly into kernel pages while
+	  calculate/verify the data digest on ULP PDU as they go through
+	  the NIC. Thus avoiding the costly per-byte overhead.
+
 endif   # if NET
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 12aabcda6db2..20add6c3f2e6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -71,6 +71,7 @@ 
 #include <net/mpls.h>
 #include <net/mptcp.h>
 #include <net/page_pool.h>
+#include <net/ulp_ddp.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -6295,9 +6296,14 @@  EXPORT_SYMBOL(pskb_extract);
  */
 void skb_condense(struct sk_buff *skb)
 {
+	bool is_ddp = false;
+
+#ifdef CONFIG_ULP_DDP
+	is_ddp = skb->ddp_crc;
+#endif
 	if (skb->data_len) {
 		if (skb->data_len > skb->end - skb->tail ||
-		    skb_cloned(skb))
+		    skb_cloned(skb) || is_ddp)
 			return;
 
 		/* Nice, we can free page frag(s) right now */
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index f9dcbad84788..d545d1525800 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -73,6 +73,7 @@  const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
 	[NETIF_F_HW_HSR_TAG_RM_BIT] =	 "hsr-tag-rm-offload",
 	[NETIF_F_HW_HSR_FWD_BIT] =	 "hsr-fwd-offload",
 	[NETIF_F_HW_HSR_DUP_BIT] =	 "hsr-dup-offload",
+	[NETIF_F_HW_ULP_DDP_BIT] =	 "ulp-ddp-offload",
 };
 
 const char
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e6ca5a1f3b59..4a7160bba09b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5149,6 +5149,9 @@  tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
 		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
 #ifdef CONFIG_TLS_DEVICE
 		nskb->decrypted = skb->decrypted;
+#endif
+#ifdef CONFIG_ULP_DDP
+		nskb->ddp_crc = skb->ddp_crc;
 #endif
 		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
 		if (list)
@@ -5182,6 +5185,11 @@  tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
 #ifdef CONFIG_TLS_DEVICE
 				if (skb->decrypted != nskb->decrypted)
 					goto end;
+#endif
+#ifdef CONFIG_ULP_DDP
+
+				if (skb->ddp_crc != nskb->ddp_crc)
+					goto end;
 #endif
 			}
 		}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e66ad6bfe808..3d9849a39b82 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1830,6 +1830,9 @@  bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
 #ifdef CONFIG_TLS_DEVICE
 	    tail->decrypted != skb->decrypted ||
+#endif
+#ifdef CONFIG_ULP_DDP
+	    tail->ddp_crc != skb->ddp_crc ||
 #endif
 	    thtail->doff != th->doff ||
 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index e09147ac9a99..96e8228d2b96 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -262,6 +262,9 @@  struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
 #ifdef CONFIG_TLS_DEVICE
 	flush |= p->decrypted ^ skb->decrypted;
 #endif
+#ifdef CONFIG_ULP_DDP
+	flush |= p->ddp_crc ^ skb->ddp_crc;
+#endif
 
 	if (flush || skb_gro_receive(p, skb)) {
 		mss = 1;