diff mbox series

[v3,15/55] ip, udp: Support MSG_SPLICE_PAGES

Message ID 20230331160914.1608208-16-dhowells@redhat.com (mailing list archive)
State New
Headers show
Series splice, net: Replace sendpage with sendmsg(MSG_SPLICE_PAGES) | expand

Commit Message

David Howells March 31, 2023, 4:08 p.m. UTC
Make IP/UDP sendmsg() support MSG_SPLICE_PAGES.  This causes pages to be
spliced from the source iterator.

This allows ->sendpage() to be replaced by something that can handle
multiple multipage folios in a single transaction.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
---
 net/ipv4/ip_output.c | 102 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 99 insertions(+), 3 deletions(-)

Comments

Willem de Bruijn April 2, 2023, 3:10 p.m. UTC | #1
David Howells wrote:
> Make IP/UDP sendmsg() support MSG_SPLICE_PAGES.  This causes pages to be
> spliced from the source iterator.
> 
> This allows ->sendpage() to be replaced by something that can handle
> multiple multipage folios in a single transaction.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: Eric Dumazet <edumazet@google.com>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: netdev@vger.kernel.org
> ---
>  net/ipv4/ip_output.c | 102 +++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 99 insertions(+), 3 deletions(-)
> 
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 4e4e308c3230..e2eaba817c1f 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -956,6 +956,79 @@ csum_page(struct page *page, int offset, int copy)
>  	return csum;
>  }
>  
> +/*
> + * Allocate a packet for MSG_SPLICE_PAGES.
> + */
> +static int __ip_splice_alloc(struct sock *sk, struct sk_buff **pskb,
> +			     unsigned int fragheaderlen, unsigned int maxfraglen,
> +			     unsigned int hh_len)
> +{
> +	struct sk_buff *skb_prev = *pskb, *skb;
> +	unsigned int fraggap = skb_prev->len - maxfraglen;
> +	unsigned int alloclen = fragheaderlen + hh_len + fraggap + 15;
> +
> +	skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
> +	if (unlikely(!skb))
> +		return -ENOBUFS;
> +
> +	/* Fill in the control structures */
> +	skb->ip_summed = CHECKSUM_NONE;
> +	skb->csum = 0;
> +	skb_reserve(skb, hh_len);
> +
> +	/* Find where to start putting bytes. */
> +	skb_put(skb, fragheaderlen + fraggap);
> +	skb_reset_network_header(skb);
> +	skb->transport_header = skb->network_header + fragheaderlen;
> +	if (fraggap) {
> +		skb->csum = skb_copy_and_csum_bits(skb_prev, maxfraglen,
> +						   skb_transport_header(skb),
> +						   fraggap);
> +		skb_prev->csum = csum_sub(skb_prev->csum, skb->csum);
> +		pskb_trim_unique(skb_prev, maxfraglen);
> +	}
> +
> +	/* Put the packet on the pending queue. */
> +	__skb_queue_tail(&sk->sk_write_queue, skb);
> +	*pskb = skb;
> +	return 0;
> +}
> +
> +/*
> + * Add (or copy) data pages for MSG_SPLICE_PAGES.
> + */
> +static int __ip_splice_pages(struct sock *sk, struct sk_buff *skb,
> +			     void *from, int *pcopy)
> +{
> +	struct msghdr *msg = from;
> +	struct page *page = NULL, **pages = &page;
> +	ssize_t copy = *pcopy;
> +	size_t off;
> +	int err;
> +
> +	copy = iov_iter_extract_pages(&msg->msg_iter, &pages, copy, 1, 0, &off);
> +	if (copy <= 0)
> +		return copy ?: -EIO;
> +
> +	err = skb_append_pagefrags(skb, page, off, copy);
> +	if (err < 0) {
> +		iov_iter_revert(&msg->msg_iter, copy);
> +		return err;
> +	}
> +
> +	if (skb->ip_summed == CHECKSUM_NONE) {
> +		__wsum csum;
> +
> +		csum = csum_page(page, off, copy);
> +		skb->csum = csum_block_add(skb->csum, csum, skb->len);
> +	}
> +
> +	skb_len_add(skb, copy);
> +	refcount_add(copy, &sk->sk_wmem_alloc);
> +	*pcopy = copy;
> +	return 0;
> +}

These functions are derived from and replace ip_append_page.
That can be removed once udp_sendpage is converted?

>
>  static int __ip_append_data(struct sock *sk,
>  			    struct flowi4 *fl4,
>  			    struct sk_buff_head *queue,
> @@ -977,7 +1050,7 @@ static int __ip_append_data(struct sock *sk,
>  	int err;
>  	int offset = 0;
>  	bool zc = false;
> -	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
> +	unsigned int maxfraglen, fragheaderlen, maxnonfragsize, initial_length;
>  	int csummode = CHECKSUM_NONE;
>  	struct rtable *rt = (struct rtable *)cork->dst;
>  	unsigned int wmem_alloc_delta = 0;
> @@ -1017,6 +1090,7 @@ static int __ip_append_data(struct sock *sk,
>  	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
>  		csummode = CHECKSUM_PARTIAL;
>  
> +	initial_length = length;
>  	if ((flags & MSG_ZEROCOPY) && length) {
>  		struct msghdr *msg = from;
>  
> @@ -1047,6 +1121,14 @@ static int __ip_append_data(struct sock *sk,
>  				skb_zcopy_set(skb, uarg, &extra_uref);
>  			}
>  		}
> +	} else if ((flags & MSG_SPLICE_PAGES) && length) {
> +		if (inet->hdrincl)
> +			return -EPERM;
> +		if (rt->dst.dev->features & NETIF_F_SG)
> +			/* We need an empty buffer to attach stuff to */
> +			initial_length = transhdrlen;

I still don't entirely understand what initial_length means.

More importantly, transhdrlen can be zero. If not called for UDP
but for RAW. Or if this is a subsequent call to a packet that is
being held with MSG_MORE.

This works fine for existing use-cases, which go to alloc_new_skb.
Not sure how this case would be different. But the comment alludes
that it does.

> +		else
> +			flags &= ~MSG_SPLICE_PAGES;
>  	}
>  
>  	cork->length += length;
> @@ -1074,6 +1156,16 @@ static int __ip_append_data(struct sock *sk,
>  			unsigned int alloclen, alloc_extra;
>  			unsigned int pagedlen;
>  			struct sk_buff *skb_prev;
> +
> +			if (unlikely(flags & MSG_SPLICE_PAGES)) {
> +				err = __ip_splice_alloc(sk, &skb, fragheaderlen,
> +							maxfraglen, hh_len);
> +				if (err < 0)
> +					goto error;
> +				continue;
> +			}
> +			initial_length = length;
> +
>  alloc_new_skb:
>  			skb_prev = skb;
>  			if (skb_prev)
> @@ -1085,7 +1177,7 @@ static int __ip_append_data(struct sock *sk,
>  			 * If remaining data exceeds the mtu,
>  			 * we know we need more fragment(s).
>  			 */
> -			datalen = length + fraggap;
> +			datalen = initial_length + fraggap;
>  			if (datalen > mtu - fragheaderlen)
>  				datalen = maxfraglen - fragheaderlen;
>  			fraglen = datalen + fragheaderlen;
> @@ -1099,7 +1191,7 @@ static int __ip_append_data(struct sock *sk,
>  			 * because we have no idea what fragment will be
>  			 * the last.
>  			 */
> -			if (datalen == length + fraggap)
> +			if (datalen == initial_length + fraggap)
>  				alloc_extra += rt->dst.trailer_len;
>  
>  			if ((flags & MSG_MORE) &&
> @@ -1206,6 +1298,10 @@ static int __ip_append_data(struct sock *sk,
>  				err = -EFAULT;
>  				goto error;
>  			}
> +		} else if (flags & MSG_SPLICE_PAGES) {
> +			err = __ip_splice_pages(sk, skb, from, &copy);
> +			if (err < 0)
> +				goto error;
>  		} else if (!zc) {
>  			int i = skb_shinfo(skb)->nr_frags;
>  
>
David Howells April 3, 2023, 9:50 a.m. UTC | #2
Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:

> > +	} else if ((flags & MSG_SPLICE_PAGES) && length) {
> > +		if (inet->hdrincl)
> > +			return -EPERM;
> > +		if (rt->dst.dev->features & NETIF_F_SG)
> > +			/* We need an empty buffer to attach stuff to */
> > +			initial_length = transhdrlen;
> 
> I still don't entirely understand what initial_length means.
> 
> More importantly, transhdrlen can be zero. If not called for UDP
> but for RAW. Or if this is a subsequent call to a packet that is
> being held with MSG_MORE.
> 
> This works fine for existing use-cases, which go to alloc_new_skb.
> Not sure how this case would be different. But the comment alludes
> that it does.

The problem is that in the non-MSG_ZEROCOPY case, __ip_append_data() assumes
that it's going to copy the data it is given and will allocate sufficient
space in the skb in advance to hold it - but I don't want to do that because I
want to splice in the pages holding the data instead.  However, I do need to
allocate space to hold the transport header.

Maybe I should change 'initial_length' to 'initial_alloc'?  It represents the
amount I think we should allocate.  Or maybe I should have a separate
allocation clause for MSG_SPLICE_PAGES?

I also wonder if __ip_append_data() really needs two places that call
getfrag().

David
David Howells April 3, 2023, 11:18 a.m. UTC | #3
Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:

> These functions are derived from and replace ip_append_page.
> That can be removed once udp_sendpage is converted?

Yeah, looks like it.

David
Willem de Bruijn April 3, 2023, 1:46 p.m. UTC | #4
David Howells wrote:
> Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> 
> > > +	} else if ((flags & MSG_SPLICE_PAGES) && length) {
> > > +		if (inet->hdrincl)
> > > +			return -EPERM;
> > > +		if (rt->dst.dev->features & NETIF_F_SG)
> > > +			/* We need an empty buffer to attach stuff to */
> > > +			initial_length = transhdrlen;
> > 
> > I still don't entirely understand what initial_length means.
> > 
> > More importantly, transhdrlen can be zero. If not called for UDP
> > but for RAW. Or if this is a subsequent call to a packet that is
> > being held with MSG_MORE.
> > 
> > This works fine for existing use-cases, which go to alloc_new_skb.
> > Not sure how this case would be different. But the comment alludes
> > that it does.
> 
> The problem is that in the non-MSG_ZEROCOPY case, __ip_append_data() assumes
> that it's going to copy the data it is given and will allocate sufficient
> space in the skb in advance to hold it - but I don't want to do that because I
> want to splice in the pages holding the data instead.  However, I do need to
> allocate space to hold the transport header.
> 
> Maybe I should change 'initial_length' to 'initial_alloc'?  It represents the
> amount I think we should allocate.  Or maybe I should have a separate
> allocation clause for MSG_SPLICE_PAGES?

The code already has to avoid allocation in the MSG_ZEROCOPY case. I
added alloc_len and paged_len for that purpose.

Only the transhdrlen will be copied with getfrag due to

    copy = datalen - transhdrlen - fraggap - pagedlen

On next iteration in the loop, when remaining data fits in the skb,
there are three cases. The first is skipped due to !NETIF_F_SG. The
other two are either copy to page frags or zerocopy page frags.

I think your code should be able to fit in. Maybe easier if it could
reuse the existing alloc_new_skb code to copy the transport header, as
MSG_ZEROCOPY does, rather than adding a new __ip_splice_alloc branch
that short-circuits that. Then __ip_splice_pages also does not need
code to copy the initial header. But this is trickier. It's fine to
leave as is.

Since your code currently does call continue before executing the rest
of that branch, no need to modify any code there? Notably replacing
length with initial_length, which itself is initialized to length in
all cases expect for MSG_SPLICE_PAGES.

Just hardcode transhdrlen as the copy argument to __ip_splice_pages.
> I also wonder if __ip_append_data() really needs two places that call
> getfrag().
> 
> David
>
David Howells April 3, 2023, 10:04 p.m. UTC | #5
Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:

> The code already has to avoid allocation in the MSG_ZEROCOPY case. I
> added alloc_len and paged_len for that purpose.
> 
> Only the transhdrlen will be copied with getfrag due to
> 
>     copy = datalen - transhdrlen - fraggap - pagedlen
> 
> On next iteration in the loop, when remaining data fits in the skb,
> there are three cases. The first is skipped due to !NETIF_F_SG. The
> other two are either copy to page frags or zerocopy page frags.
> 
> I think your code should be able to fit in. Maybe easier if it could
> reuse the existing alloc_new_skb code to copy the transport header, as
> MSG_ZEROCOPY does, rather than adding a new __ip_splice_alloc branch
> that short-circuits that. Then __ip_splice_pages also does not need
> code to copy the initial header. But this is trickier. It's fine to
> leave as is.
> 
> Since your code currently does call continue before executing the rest
> of that branch, no need to modify any code there? Notably replacing
> length with initial_length, which itself is initialized to length in
> all cases expect for MSG_SPLICE_PAGES.

Okay.  How about the attached?  This seems to work.  Just setting "paged" to
true seems to do the right thing in __ip_append_data() when allocating /
setting up the skbuff, and then __ip_splice_pages() is called to add the
pages.

David
---
commit 9ac72c83407c8aef4be0c84513ec27bac9cfbcaa
Author: David Howells <dhowells@redhat.com>
Date:   Thu Mar 9 14:27:29 2023 +0000

    ip, udp: Support MSG_SPLICE_PAGES
    
    Make IP/UDP sendmsg() support MSG_SPLICE_PAGES.  This causes pages to be
    spliced from the source iterator.
    
    This allows ->sendpage() to be replaced by something that can handle
    multiple multipage folios in a single transaction.
    
    Signed-off-by: David Howells <dhowells@redhat.com>
    cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
    cc: "David S. Miller" <davem@davemloft.net>
    cc: Eric Dumazet <edumazet@google.com>
    cc: Jakub Kicinski <kuba@kernel.org>
    cc: Paolo Abeni <pabeni@redhat.com>
    cc: Jens Axboe <axboe@kernel.dk>
    cc: Matthew Wilcox <willy@infradead.org>
    cc: netdev@vger.kernel.org

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6109a86a8a4b..fe2e48874191 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -956,6 +956,41 @@ csum_page(struct page *page, int offset, int copy)
 	return csum;
 }
 
+/*
+ * Add (or copy) data pages for MSG_SPLICE_PAGES.
+ */
+static int __ip_splice_pages(struct sock *sk, struct sk_buff *skb,
+			     void *from, int *pcopy)
+{
+	struct msghdr *msg = from;
+	struct page *page = NULL, **pages = &page;
+	ssize_t copy = *pcopy;
+	size_t off;
+	int err;
+
+	copy = iov_iter_extract_pages(&msg->msg_iter, &pages, copy, 1, 0, &off);
+	if (copy <= 0)
+		return copy ?: -EIO;
+
+	err = skb_append_pagefrags(skb, page, off, copy);
+	if (err < 0) {
+		iov_iter_revert(&msg->msg_iter, copy);
+		return err;
+	}
+
+	if (skb->ip_summed == CHECKSUM_NONE) {
+		__wsum csum;
+
+		csum = csum_page(page, off, copy);
+		skb->csum = csum_block_add(skb->csum, csum, skb->len);
+	}
+
+	skb_len_add(skb, copy);
+	refcount_add(copy, &sk->sk_wmem_alloc);
+	*pcopy = copy;
+	return 0;
+}
+
 static int __ip_append_data(struct sock *sk,
 			    struct flowi4 *fl4,
 			    struct sk_buff_head *queue,
@@ -1047,6 +1082,15 @@ static int __ip_append_data(struct sock *sk,
 				skb_zcopy_set(skb, uarg, &extra_uref);
 			}
 		}
+	} else if ((flags & MSG_SPLICE_PAGES) && length) {
+		if (inet->hdrincl)
+			return -EPERM;
+		if (rt->dst.dev->features & NETIF_F_SG) {
+			/* We need an empty buffer to attach stuff to */
+			paged = true;
+		} else {
+			flags &= ~MSG_SPLICE_PAGES;
+		}
 	}
 
 	cork->length += length;
@@ -1206,6 +1250,10 @@ static int __ip_append_data(struct sock *sk,
 				err = -EFAULT;
 				goto error;
 			}
+		} else if (flags & MSG_SPLICE_PAGES) {
+			err = __ip_splice_pages(sk, skb, from, &copy);
+			if (err < 0)
+				goto error;
 		} else if (!zc) {
 			int i = skb_shinfo(skb)->nr_frags;
Willem de Bruijn April 4, 2023, 4:58 p.m. UTC | #6
David Howells wrote:
> Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> 
> > The code already has to avoid allocation in the MSG_ZEROCOPY case. I
> > added alloc_len and paged_len for that purpose.
> > 
> > Only the transhdrlen will be copied with getfrag due to
> > 
> >     copy = datalen - transhdrlen - fraggap - pagedlen
> > 
> > On next iteration in the loop, when remaining data fits in the skb,
> > there are three cases. The first is skipped due to !NETIF_F_SG. The
> > other two are either copy to page frags or zerocopy page frags.
> > 
> > I think your code should be able to fit in. Maybe easier if it could
> > reuse the existing alloc_new_skb code to copy the transport header, as
> > MSG_ZEROCOPY does, rather than adding a new __ip_splice_alloc branch
> > that short-circuits that. Then __ip_splice_pages also does not need
> > code to copy the initial header. But this is trickier. It's fine to
> > leave as is.
> > 
> > Since your code currently does call continue before executing the rest
> > of that branch, no need to modify any code there? Notably replacing
> > length with initial_length, which itself is initialized to length in
> > all cases expect for MSG_SPLICE_PAGES.
> 
> Okay.  How about the attached?  This seems to work.  Just setting "paged" to
> true seems to do the right thing in __ip_append_data() when allocating /
> setting up the skbuff, and then __ip_splice_pages() is called to add the
> pages.

If this works, much preferred. Looks great to me.

As said, then __ip_splice_pages() probably no longer needs the
preamble to copy initial header bytes.

> David
> ---
> commit 9ac72c83407c8aef4be0c84513ec27bac9cfbcaa
> Author: David Howells <dhowells@redhat.com>
> Date:   Thu Mar 9 14:27:29 2023 +0000
> 
>     ip, udp: Support MSG_SPLICE_PAGES
>     
>     Make IP/UDP sendmsg() support MSG_SPLICE_PAGES.  This causes pages to be
>     spliced from the source iterator.
>     
>     This allows ->sendpage() to be replaced by something that can handle
>     multiple multipage folios in a single transaction.
>     
>     Signed-off-by: David Howells <dhowells@redhat.com>
>     cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
>     cc: "David S. Miller" <davem@davemloft.net>
>     cc: Eric Dumazet <edumazet@google.com>
>     cc: Jakub Kicinski <kuba@kernel.org>
>     cc: Paolo Abeni <pabeni@redhat.com>
>     cc: Jens Axboe <axboe@kernel.dk>
>     cc: Matthew Wilcox <willy@infradead.org>
>     cc: netdev@vger.kernel.org
> 
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 6109a86a8a4b..fe2e48874191 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -956,6 +956,41 @@ csum_page(struct page *page, int offset, int copy)
>  	return csum;
>  }
>  
> +/*
> + * Add (or copy) data pages for MSG_SPLICE_PAGES.
> + */
> +static int __ip_splice_pages(struct sock *sk, struct sk_buff *skb,
> +			     void *from, int *pcopy)
> +{
> +	struct msghdr *msg = from;
> +	struct page *page = NULL, **pages = &page;
> +	ssize_t copy = *pcopy;
> +	size_t off;
> +	int err;
> +
> +	copy = iov_iter_extract_pages(&msg->msg_iter, &pages, copy, 1, 0, &off);
> +	if (copy <= 0)
> +		return copy ?: -EIO;
> +
> +	err = skb_append_pagefrags(skb, page, off, copy);
> +	if (err < 0) {
> +		iov_iter_revert(&msg->msg_iter, copy);
> +		return err;
> +	}
> +
> +	if (skb->ip_summed == CHECKSUM_NONE) {
> +		__wsum csum;
> +
> +		csum = csum_page(page, off, copy);
> +		skb->csum = csum_block_add(skb->csum, csum, skb->len);
> +	}
> +
> +	skb_len_add(skb, copy);
> +	refcount_add(copy, &sk->sk_wmem_alloc);
> +	*pcopy = copy;
> +	return 0;
> +}
> +
>  static int __ip_append_data(struct sock *sk,
>  			    struct flowi4 *fl4,
>  			    struct sk_buff_head *queue,
> @@ -1047,6 +1082,15 @@ static int __ip_append_data(struct sock *sk,
>  				skb_zcopy_set(skb, uarg, &extra_uref);
>  			}
>  		}
> +	} else if ((flags & MSG_SPLICE_PAGES) && length) {
> +		if (inet->hdrincl)
> +			return -EPERM;
> +		if (rt->dst.dev->features & NETIF_F_SG) {
> +			/* We need an empty buffer to attach stuff to */
> +			paged = true;
> +		} else {
> +			flags &= ~MSG_SPLICE_PAGES;
> +		}
>  	}
>  
>  	cork->length += length;
> @@ -1206,6 +1250,10 @@ static int __ip_append_data(struct sock *sk,
>  				err = -EFAULT;
>  				goto error;
>  			}
> +		} else if (flags & MSG_SPLICE_PAGES) {
> +			err = __ip_splice_pages(sk, skb, from, &copy);
> +			if (err < 0)
> +				goto error;
>  		} else if (!zc) {
>  			int i = skb_shinfo(skb)->nr_frags;
>  
>
David Howells April 4, 2023, 5:16 p.m. UTC | #7
Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:

> > Okay.  How about the attached?  This seems to work.  Just setting "paged" to
> > true seems to do the right thing in __ip_append_data() when allocating /
> > setting up the skbuff, and then __ip_splice_pages() is called to add the
> > pages.
> 
> If this works, much preferred. Looks great to me.

:-)

> As said, then __ip_splice_pages() probably no longer needs the
> preamble to copy initial header bytes.

Sorry, what?  It only attaches pages extracted from the iterator.

David
Willem de Bruijn April 4, 2023, 5:36 p.m. UTC | #8
David Howells wrote:
> Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> 
> > > Okay.  How about the attached?  This seems to work.  Just setting "paged" to
> > > true seems to do the right thing in __ip_append_data() when allocating /
> > > setting up the skbuff, and then __ip_splice_pages() is called to add the
> > > pages.
> > 
> > If this works, much preferred. Looks great to me.
> 
> :-)
> 
> > As said, then __ip_splice_pages() probably no longer needs the
> > preamble to copy initial header bytes.
> 
> Sorry, what?  It only attaches pages extracted from the iterator.

Ehm indeed. Never mind.
diff mbox series

Patch

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4e4e308c3230..e2eaba817c1f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -956,6 +956,79 @@  csum_page(struct page *page, int offset, int copy)
 	return csum;
 }
 
+/*
+ * Allocate a packet for MSG_SPLICE_PAGES.
+ */
+static int __ip_splice_alloc(struct sock *sk, struct sk_buff **pskb,
+			     unsigned int fragheaderlen, unsigned int maxfraglen,
+			     unsigned int hh_len)
+{
+	struct sk_buff *skb_prev = *pskb, *skb;
+	unsigned int fraggap = skb_prev->len - maxfraglen;
+	unsigned int alloclen = fragheaderlen + hh_len + fraggap + 15;
+
+	skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
+	if (unlikely(!skb))
+		return -ENOBUFS;
+
+	/* Fill in the control structures */
+	skb->ip_summed = CHECKSUM_NONE;
+	skb->csum = 0;
+	skb_reserve(skb, hh_len);
+
+	/* Find where to start putting bytes. */
+	skb_put(skb, fragheaderlen + fraggap);
+	skb_reset_network_header(skb);
+	skb->transport_header = skb->network_header + fragheaderlen;
+	if (fraggap) {
+		skb->csum = skb_copy_and_csum_bits(skb_prev, maxfraglen,
+						   skb_transport_header(skb),
+						   fraggap);
+		skb_prev->csum = csum_sub(skb_prev->csum, skb->csum);
+		pskb_trim_unique(skb_prev, maxfraglen);
+	}
+
+	/* Put the packet on the pending queue. */
+	__skb_queue_tail(&sk->sk_write_queue, skb);
+	*pskb = skb;
+	return 0;
+}
+
+/*
+ * Add (or copy) data pages for MSG_SPLICE_PAGES.
+ */
+static int __ip_splice_pages(struct sock *sk, struct sk_buff *skb,
+			     void *from, int *pcopy)
+{
+	struct msghdr *msg = from;
+	struct page *page = NULL, **pages = &page;
+	ssize_t copy = *pcopy;
+	size_t off;
+	int err;
+
+	copy = iov_iter_extract_pages(&msg->msg_iter, &pages, copy, 1, 0, &off);
+	if (copy <= 0)
+		return copy ?: -EIO;
+
+	err = skb_append_pagefrags(skb, page, off, copy);
+	if (err < 0) {
+		iov_iter_revert(&msg->msg_iter, copy);
+		return err;
+	}
+
+	if (skb->ip_summed == CHECKSUM_NONE) {
+		__wsum csum;
+
+		csum = csum_page(page, off, copy);
+		skb->csum = csum_block_add(skb->csum, csum, skb->len);
+	}
+
+	skb_len_add(skb, copy);
+	refcount_add(copy, &sk->sk_wmem_alloc);
+	*pcopy = copy;
+	return 0;
+}
+
 static int __ip_append_data(struct sock *sk,
 			    struct flowi4 *fl4,
 			    struct sk_buff_head *queue,
@@ -977,7 +1050,7 @@  static int __ip_append_data(struct sock *sk,
 	int err;
 	int offset = 0;
 	bool zc = false;
-	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
+	unsigned int maxfraglen, fragheaderlen, maxnonfragsize, initial_length;
 	int csummode = CHECKSUM_NONE;
 	struct rtable *rt = (struct rtable *)cork->dst;
 	unsigned int wmem_alloc_delta = 0;
@@ -1017,6 +1090,7 @@  static int __ip_append_data(struct sock *sk,
 	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
 		csummode = CHECKSUM_PARTIAL;
 
+	initial_length = length;
 	if ((flags & MSG_ZEROCOPY) && length) {
 		struct msghdr *msg = from;
 
@@ -1047,6 +1121,14 @@  static int __ip_append_data(struct sock *sk,
 				skb_zcopy_set(skb, uarg, &extra_uref);
 			}
 		}
+	} else if ((flags & MSG_SPLICE_PAGES) && length) {
+		if (inet->hdrincl)
+			return -EPERM;
+		if (rt->dst.dev->features & NETIF_F_SG)
+			/* We need an empty buffer to attach stuff to */
+			initial_length = transhdrlen;
+		else
+			flags &= ~MSG_SPLICE_PAGES;
 	}
 
 	cork->length += length;
@@ -1074,6 +1156,16 @@  static int __ip_append_data(struct sock *sk,
 			unsigned int alloclen, alloc_extra;
 			unsigned int pagedlen;
 			struct sk_buff *skb_prev;
+
+			if (unlikely(flags & MSG_SPLICE_PAGES)) {
+				err = __ip_splice_alloc(sk, &skb, fragheaderlen,
+							maxfraglen, hh_len);
+				if (err < 0)
+					goto error;
+				continue;
+			}
+			initial_length = length;
+
 alloc_new_skb:
 			skb_prev = skb;
 			if (skb_prev)
@@ -1085,7 +1177,7 @@  static int __ip_append_data(struct sock *sk,
 			 * If remaining data exceeds the mtu,
 			 * we know we need more fragment(s).
 			 */
-			datalen = length + fraggap;
+			datalen = initial_length + fraggap;
 			if (datalen > mtu - fragheaderlen)
 				datalen = maxfraglen - fragheaderlen;
 			fraglen = datalen + fragheaderlen;
@@ -1099,7 +1191,7 @@  static int __ip_append_data(struct sock *sk,
 			 * because we have no idea what fragment will be
 			 * the last.
 			 */
-			if (datalen == length + fraggap)
+			if (datalen == initial_length + fraggap)
 				alloc_extra += rt->dst.trailer_len;
 
 			if ((flags & MSG_MORE) &&
@@ -1206,6 +1298,10 @@  static int __ip_append_data(struct sock *sk,
 				err = -EFAULT;
 				goto error;
 			}
+		} else if (flags & MSG_SPLICE_PAGES) {
+			err = __ip_splice_pages(sk, skb, from, &copy);
+			if (err < 0)
+				goto error;
 		} else if (!zc) {
 			int i = skb_shinfo(skb)->nr_frags;