diff mbox series

[net-next,v5,07/14] ipv4, ipv6: Use splice_eof() to flush

Message ID 20230607140559.2263470-8-dhowells@redhat.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series splice, net: Rewrite splice-to-socket, fix SPLICE_F_MORE and handle MSG_SPLICE_PAGES in AF_TLS | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1062 this patch: 1062
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 127 this patch: 127
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1135 this patch: 1135
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 164 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

David Howells June 7, 2023, 2:05 p.m. UTC
Allow splice to undo the effects of MSG_MORE after prematurely ending a
splice/sendfile due to getting an EOF condition (->splice_read() returned
0) after splice had called sendmsg() with MSG_MORE set when the user didn't
set MSG_MORE.

For UDP, a pending packet will not be emitted if the socket is closed
before it is flushed; with this change, it will be flushed by ->splice_eof().

For TCP, it's not clear that MSG_MORE is actually effective.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Eric Dumazet <edumazet@google.com>
cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
cc: David Ahern <dsahern@kernel.org>
cc: "David S. Miller" <davem@davemloft.net>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
---
 include/net/inet_common.h |  1 +
 include/net/tcp.h         |  1 +
 include/net/udp.h         |  1 +
 net/ipv4/af_inet.c        | 18 ++++++++++++++++++
 net/ipv4/tcp.c            | 16 ++++++++++++++++
 net/ipv4/tcp_ipv4.c       |  1 +
 net/ipv4/udp.c            | 16 ++++++++++++++++
 net/ipv6/af_inet6.c       |  1 +
 net/ipv6/tcp_ipv6.c       |  1 +
 net/ipv6/udp.c            | 18 ++++++++++++++++++
 10 files changed, 74 insertions(+)

Comments

Kuniyuki Iwashima June 7, 2023, 3:32 p.m. UTC | #1
From: David Howells <dhowells@redhat.com>
Date: Wed,  7 Jun 2023 15:05:52 +0100
> Allow splice to undo the effects of MSG_MORE after prematurely ending a
> splice/sendfile due to getting an EOF condition (->splice_read() returned
> 0) after splice had called sendmsg() with MSG_MORE set when the user didn't
> set MSG_MORE.
> 
> For UDP, a pending packet will not be emitted if the socket is closed
> before it is flushed; with this change, it will be flushed by ->splice_eof().
> 
> For TCP, it's not clear that MSG_MORE is actually effective.
> 
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Eric Dumazet <edumazet@google.com>
> cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
> cc: David Ahern <dsahern@kernel.org>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: netdev@vger.kernel.org
> ---
>  include/net/inet_common.h |  1 +
>  include/net/tcp.h         |  1 +
>  include/net/udp.h         |  1 +
>  net/ipv4/af_inet.c        | 18 ++++++++++++++++++
>  net/ipv4/tcp.c            | 16 ++++++++++++++++
>  net/ipv4/tcp_ipv4.c       |  1 +
>  net/ipv4/udp.c            | 16 ++++++++++++++++
>  net/ipv6/af_inet6.c       |  1 +
>  net/ipv6/tcp_ipv6.c       |  1 +
>  net/ipv6/udp.c            | 18 ++++++++++++++++++
>  10 files changed, 74 insertions(+)
> 
> diff --git a/include/net/inet_common.h b/include/net/inet_common.h
> index 77f4b0ef5b92..a75333342c4e 100644
> --- a/include/net/inet_common.h
> +++ b/include/net/inet_common.h
> @@ -35,6 +35,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock,
>  		   struct sock *newsk);
>  int inet_send_prepare(struct sock *sk);
>  int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
> +void inet_splice_eof(struct socket *sock);
>  ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
>  		      size_t size, int flags);
>  int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 68990a8f556a..49611af31bb7 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -327,6 +327,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
>  int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
>  int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
>  			 size_t size, struct ubuf_info *uarg);
> +void tcp_splice_eof(struct socket *sock);
>  int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
>  		 int flags);
>  int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
> diff --git a/include/net/udp.h b/include/net/udp.h
> index 5cad44318d71..4ed0b47c5582 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -278,6 +278,7 @@ int udp_get_port(struct sock *sk, unsigned short snum,
>  int udp_err(struct sk_buff *, u32);
>  int udp_abort(struct sock *sk, int err);
>  int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
> +void udp_splice_eof(struct socket *sock);
>  int udp_push_pending_frames(struct sock *sk);
>  void udp_flush_pending_frames(struct sock *sk);
>  int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index b5735b3551cf..6cfb78592836 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -831,6 +831,21 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
>  }
>  EXPORT_SYMBOL(inet_sendmsg);
>  
> +void inet_splice_eof(struct socket *sock)
> +{
> +	const struct proto *prot;
> +	struct sock *sk = sock->sk;
> +
> +	if (unlikely(inet_send_prepare(sk)))
> +		return;
> +
> +	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
> +	prot = READ_ONCE(sk->sk_prot);
> +	if (prot->splice_eof)
> +		sk->sk_prot->splice_eof(sock);

We need to use prot here.


> +}
> +EXPORT_SYMBOL_GPL(inet_splice_eof);
> +
>  ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
>  		      size_t size, int flags)
>  {
> @@ -1050,6 +1065,7 @@ const struct proto_ops inet_stream_ops = {
>  #ifdef CONFIG_MMU
>  	.mmap		   = tcp_mmap,
>  #endif
> +	.splice_eof	   = inet_splice_eof,
>  	.sendpage	   = inet_sendpage,
>  	.splice_read	   = tcp_splice_read,
>  	.read_sock	   = tcp_read_sock,
> @@ -1084,6 +1100,7 @@ const struct proto_ops inet_dgram_ops = {
>  	.read_skb	   = udp_read_skb,
>  	.recvmsg	   = inet_recvmsg,
>  	.mmap		   = sock_no_mmap,
> +	.splice_eof	   = inet_splice_eof,
>  	.sendpage	   = inet_sendpage,
>  	.set_peek_off	   = sk_set_peek_off,
>  #ifdef CONFIG_COMPAT
> @@ -1115,6 +1132,7 @@ static const struct proto_ops inet_sockraw_ops = {
>  	.sendmsg	   = inet_sendmsg,
>  	.recvmsg	   = inet_recvmsg,
>  	.mmap		   = sock_no_mmap,
> +	.splice_eof	   = inet_splice_eof,
>  	.sendpage	   = inet_sendpage,
>  #ifdef CONFIG_COMPAT
>  	.compat_ioctl	   = inet_compat_ioctl,
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 53b7751b68e1..09f03221a6f1 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -1371,6 +1371,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
>  }
>  EXPORT_SYMBOL(tcp_sendmsg);
>  
> +void tcp_splice_eof(struct socket *sock)
> +{
> +	struct sock *sk = sock->sk;
> +	struct tcp_sock *tp = tcp_sk(sk);
> +	int mss_now, size_goal;
> +
> +	if (!tcp_write_queue_tail(sk))
> +		return;
> +
> +	lock_sock(sk);
> +	mss_now = tcp_send_mss(sk, &size_goal, 0);
> +	tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
> +	release_sock(sk);
> +}
> +EXPORT_SYMBOL_GPL(tcp_splice_eof);
> +
>  /*
>   *	Handle reading urgent data. BSD has very simple semantics for
>   *	this, no blocking and very strange errors 8)
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 53e9ce2f05bb..84a5d557dc1a 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -3116,6 +3116,7 @@ struct proto tcp_prot = {
>  	.keepalive		= tcp_set_keepalive,
>  	.recvmsg		= tcp_recvmsg,
>  	.sendmsg		= tcp_sendmsg,
> +	.splice_eof		= tcp_splice_eof,
>  	.sendpage		= tcp_sendpage,
>  	.backlog_rcv		= tcp_v4_do_rcv,
>  	.release_cb		= tcp_release_cb,
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index fd3dae081f3a..df5e407286d7 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1324,6 +1324,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  }
>  EXPORT_SYMBOL(udp_sendmsg);
>  
> +void udp_splice_eof(struct socket *sock)
> +{
> +	struct sock *sk = sock->sk;
> +	struct udp_sock *up = udp_sk(sk);
> +
> +	if (!up->pending || READ_ONCE(up->corkflag))
> +		return;
> +
> +	lock_sock(sk);
> +	if (up->pending && !READ_ONCE(up->corkflag))
> +		udp_push_pending_frames(sk);
> +	release_sock(sk);
> +}
> +EXPORT_SYMBOL_GPL(udp_splice_eof);
> +
>  int udp_sendpage(struct sock *sk, struct page *page, int offset,
>  		 size_t size, int flags)
>  {
> @@ -2918,6 +2933,7 @@ struct proto udp_prot = {
>  	.getsockopt		= udp_getsockopt,
>  	.sendmsg		= udp_sendmsg,
>  	.recvmsg		= udp_recvmsg,
> +	.splice_eof		= udp_splice_eof,
>  	.sendpage		= udp_sendpage,
>  	.release_cb		= ip4_datagram_release_cb,
>  	.hash			= udp_lib_hash,
> diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
> index 2bbf13216a3d..564942bee067 100644
> --- a/net/ipv6/af_inet6.c
> +++ b/net/ipv6/af_inet6.c
> @@ -695,6 +695,7 @@ const struct proto_ops inet6_stream_ops = {
>  #ifdef CONFIG_MMU
>  	.mmap		   = tcp_mmap,
>  #endif
> +	.splice_eof	   = inet_splice_eof,
>  	.sendpage	   = inet_sendpage,
>  	.sendmsg_locked    = tcp_sendmsg_locked,
>  	.sendpage_locked   = tcp_sendpage_locked,
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index d657713d1c71..c17c8ff94b79 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -2150,6 +2150,7 @@ struct proto tcpv6_prot = {
>  	.keepalive		= tcp_set_keepalive,
>  	.recvmsg		= tcp_recvmsg,
>  	.sendmsg		= tcp_sendmsg,
> +	.splice_eof		= tcp_splice_eof,
>  	.sendpage		= tcp_sendpage,
>  	.backlog_rcv		= tcp_v6_do_rcv,
>  	.release_cb		= tcp_release_cb,
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index e5a337e6b970..6c5975b13ae3 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1653,6 +1653,23 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  }
>  EXPORT_SYMBOL(udpv6_sendmsg);
>  
> +static void udpv6_splice_eof(struct socket *sock)
> +{
> +	struct sock *sk = sock->sk;
> +	struct udp_sock *up = udp_sk(sk);
> +
> +	if (!up->pending || READ_ONCE(up->corkflag))
> +		return;
> +
> +	if (up->pending == AF_INET)
> +		udp_splice_eof(sock);

Do we need this ?


> +
> +	lock_sock(sk);
> +	if (up->pending && !READ_ONCE(up->corkflag))
> +		udp_push_pending_frames(sk);

We should use udp_v6_push_pending_frames(sk) as up->pending
could be AF_INET even after the test above.


> +	release_sock(sk);
> +}
> +
>  void udpv6_destroy_sock(struct sock *sk)
>  {
>  	struct udp_sock *up = udp_sk(sk);
> @@ -1764,6 +1781,7 @@ struct proto udpv6_prot = {
>  	.getsockopt		= udpv6_getsockopt,
>  	.sendmsg		= udpv6_sendmsg,
>  	.recvmsg		= udpv6_recvmsg,
> +	.splice_eof		= udpv6_splice_eof,
>  	.release_cb		= ip6_datagram_release_cb,
>  	.hash			= udp_lib_hash,
>  	.unhash			= udp_lib_unhash,
David Howells June 7, 2023, 3:43 p.m. UTC | #2
Kuniyuki Iwashima <kuniyu@amazon.com> wrote:

> > +	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
> > +	prot = READ_ONCE(sk->sk_prot);
> > +	if (prot->splice_eof)
> > +		sk->sk_prot->splice_eof(sock);
> 
> We need to use prot here.

Yeah.

> > +	if (up->pending == AF_INET)
> > +		udp_splice_eof(sock);
> 
> Do we need this ?

Actually, no.  udp_v6_push_pending_frames() will do this.

> > +	lock_sock(sk);
> > +	if (up->pending && !READ_ONCE(up->corkflag))
> > +		udp_push_pending_frames(sk);
> 
> We should use udp_v6_push_pending_frames(sk) as up->pending
> could be AF_INET even after the test above.

Yeah.

Updated version attached for your perusal (I will post a v6 too).

David
---
commit 8b95b9cd654835eb2ff1ad24cd6de802836c4062
Author: David Howells <dhowells@redhat.com>
Date:   Wed Jun 7 14:44:34 2023 +0100

    ipv4, ipv6: Use splice_eof() to flush
    
    Allow splice to undo the effects of MSG_MORE after prematurely ending a
    splice/sendfile due to getting an EOF condition (->splice_read() returned
    0) after splice had called sendmsg() with MSG_MORE set when the user didn't
    set MSG_MORE.
    
    For UDP, a pending packet will not be emitted if the socket is closed
    before it is flushed; with this change, it will be flushed by ->splice_eof().
    
    For TCP, it's not clear that MSG_MORE is actually effective.
    
    Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
    Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/
    Signed-off-by: David Howells <dhowells@redhat.com>
    cc: Kuniyuki Iwashima <kuniyu@amazon.com>
    cc: Eric Dumazet <edumazet@google.com>
    cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
    cc: David Ahern <dsahern@kernel.org>
    cc: "David S. Miller" <davem@davemloft.net>
    cc: Jakub Kicinski <kuba@kernel.org>
    cc: Paolo Abeni <pabeni@redhat.com>
    cc: Jens Axboe <axboe@kernel.dk>
    cc: Matthew Wilcox <willy@infradead.org>
    cc: netdev@vger.kernel.org

Notes:
    ver #6)
     - In inet_splice_eof(), use prot after deref of sk->sk_prot.
     - In udpv6_splice_eof(), use udp_v6_push_pending_frames().
     - In udpv6_splice_eof(), don't check for AF_INET.

diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 77f4b0ef5b92..a75333342c4e 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -35,6 +35,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock,
 		   struct sock *newsk);
 int inet_send_prepare(struct sock *sk);
 int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
+void inet_splice_eof(struct socket *sock);
 ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 		      size_t size, int flags);
 int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 68990a8f556a..49611af31bb7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -327,6 +327,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
 			 size_t size, struct ubuf_info *uarg);
+void tcp_splice_eof(struct socket *sock);
 int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
 		 int flags);
 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
diff --git a/include/net/udp.h b/include/net/udp.h
index 5cad44318d71..4ed0b47c5582 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -278,6 +278,7 @@ int udp_get_port(struct sock *sk, unsigned short snum,
 int udp_err(struct sk_buff *, u32);
 int udp_abort(struct sock *sk, int err);
 int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+void udp_splice_eof(struct socket *sock);
 int udp_push_pending_frames(struct sock *sk);
 void udp_flush_pending_frames(struct sock *sk);
 int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b5735b3551cf..fd233c4195ac 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -831,6 +831,21 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 }
 EXPORT_SYMBOL(inet_sendmsg);
 
+void inet_splice_eof(struct socket *sock)
+{
+	const struct proto *prot;
+	struct sock *sk = sock->sk;
+
+	if (unlikely(inet_send_prepare(sk)))
+		return;
+
+	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot->splice_eof)
+		prot->splice_eof(sock);
+}
+EXPORT_SYMBOL_GPL(inet_splice_eof);
+
 ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 		      size_t size, int flags)
 {
@@ -1050,6 +1065,7 @@ const struct proto_ops inet_stream_ops = {
 #ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
 #endif
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
@@ -1084,6 +1100,7 @@ const struct proto_ops inet_dgram_ops = {
 	.read_skb	   = udp_read_skb,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 	.set_peek_off	   = sk_set_peek_off,
 #ifdef CONFIG_COMPAT
@@ -1115,6 +1132,7 @@ static const struct proto_ops inet_sockraw_ops = {
 	.sendmsg	   = inet_sendmsg,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	   = inet_compat_ioctl,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 53b7751b68e1..09f03221a6f1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1371,6 +1371,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 }
 EXPORT_SYMBOL(tcp_sendmsg);
 
+void tcp_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now, size_goal;
+
+	if (!tcp_write_queue_tail(sk))
+		return;
+
+	lock_sock(sk);
+	mss_now = tcp_send_mss(sk, &size_goal, 0);
+	tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
+	release_sock(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_splice_eof);
+
 /*
  *	Handle reading urgent data. BSD has very simple semantics for
  *	this, no blocking and very strange errors 8)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 53e9ce2f05bb..84a5d557dc1a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3116,6 +3116,7 @@ struct proto tcp_prot = {
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
+	.splice_eof		= tcp_splice_eof,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.release_cb		= tcp_release_cb,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fd3dae081f3a..df5e407286d7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1324,6 +1324,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 }
 EXPORT_SYMBOL(udp_sendmsg);
 
+void udp_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct udp_sock *up = udp_sk(sk);
+
+	if (!up->pending || READ_ONCE(up->corkflag))
+		return;
+
+	lock_sock(sk);
+	if (up->pending && !READ_ONCE(up->corkflag))
+		udp_push_pending_frames(sk);
+	release_sock(sk);
+}
+EXPORT_SYMBOL_GPL(udp_splice_eof);
+
 int udp_sendpage(struct sock *sk, struct page *page, int offset,
 		 size_t size, int flags)
 {
@@ -2918,6 +2933,7 @@ struct proto udp_prot = {
 	.getsockopt		= udp_getsockopt,
 	.sendmsg		= udp_sendmsg,
 	.recvmsg		= udp_recvmsg,
+	.splice_eof		= udp_splice_eof,
 	.sendpage		= udp_sendpage,
 	.release_cb		= ip4_datagram_release_cb,
 	.hash			= udp_lib_hash,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2bbf13216a3d..564942bee067 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -695,6 +695,7 @@ const struct proto_ops inet6_stream_ops = {
 #ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
 #endif
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 	.sendmsg_locked    = tcp_sendmsg_locked,
 	.sendpage_locked   = tcp_sendpage_locked,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d657713d1c71..c17c8ff94b79 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2150,6 +2150,7 @@ struct proto tcpv6_prot = {
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
+	.splice_eof		= tcp_splice_eof,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.release_cb		= tcp_release_cb,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e5a337e6b970..3a592dc129e9 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1653,6 +1653,20 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 }
 EXPORT_SYMBOL(udpv6_sendmsg);
 
+static void udpv6_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct udp_sock *up = udp_sk(sk);
+
+	if (!up->pending || READ_ONCE(up->corkflag))
+		return;
+
+	lock_sock(sk);
+	if (up->pending && !READ_ONCE(up->corkflag))
+		udp_push_pending_frames(sk);
+	release_sock(sk);
+}
+
 void udpv6_destroy_sock(struct sock *sk)
 {
 	struct udp_sock *up = udp_sk(sk);
@@ -1764,6 +1778,7 @@ struct proto udpv6_prot = {
 	.getsockopt		= udpv6_getsockopt,
 	.sendmsg		= udpv6_sendmsg,
 	.recvmsg		= udpv6_recvmsg,
+	.splice_eof		= udpv6_splice_eof,
 	.release_cb		= ip6_datagram_release_cb,
 	.hash			= udp_lib_hash,
 	.unhash			= udp_lib_unhash,
Kuniyuki Iwashima June 7, 2023, 3:54 p.m. UTC | #3
From: David Howells <dhowells@redhat.com>
Date: Wed, 07 Jun 2023 16:43:52 +0100
> Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> 
> > > +	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
> > > +	prot = READ_ONCE(sk->sk_prot);
> > > +	if (prot->splice_eof)
> > > +		sk->sk_prot->splice_eof(sock);
> > 
> > We need to use prot here.
> 
> Yeah.
> 
> > > +	if (up->pending == AF_INET)
> > > +		udp_splice_eof(sock);
> > 
> > Do we need this ?
> 
> Actually, no.  udp_v6_push_pending_frames() will do this.
> 
> > > +	lock_sock(sk);
> > > +	if (up->pending && !READ_ONCE(up->corkflag))
> > > +		udp_push_pending_frames(sk);
> > 
> > We should use udp_v6_push_pending_frames(sk) as up->pending
> > could be AF_INET even after the test above.
> 
> Yeah.
> 
> Updated version attached for your perusal (I will post a v6 too).
> 
> David
> ---
> commit 8b95b9cd654835eb2ff1ad24cd6de802836c4062
> Author: David Howells <dhowells@redhat.com>
> Date:   Wed Jun 7 14:44:34 2023 +0100
> 
>     ipv4, ipv6: Use splice_eof() to flush
>     
>     Allow splice to undo the effects of MSG_MORE after prematurely ending a
>     splice/sendfile due to getting an EOF condition (->splice_read() returned
>     0) after splice had called sendmsg() with MSG_MORE set when the user didn't
>     set MSG_MORE.
>     
>     For UDP, a pending packet will not be emitted if the socket is closed
>     before it is flushed; with this change, it will be flushed by ->splice_eof().
>     
>     For TCP, it's not clear that MSG_MORE is actually effective.
>     
>     Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
>     Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/
>     Signed-off-by: David Howells <dhowells@redhat.com>
>     cc: Kuniyuki Iwashima <kuniyu@amazon.com>
>     cc: Eric Dumazet <edumazet@google.com>
>     cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
>     cc: David Ahern <dsahern@kernel.org>
>     cc: "David S. Miller" <davem@davemloft.net>
>     cc: Jakub Kicinski <kuba@kernel.org>
>     cc: Paolo Abeni <pabeni@redhat.com>
>     cc: Jens Axboe <axboe@kernel.dk>
>     cc: Matthew Wilcox <willy@infradead.org>
>     cc: netdev@vger.kernel.org
> 
> Notes:
>     ver #6)
>      - In inet_splice_eof(), use prot after deref of sk->sk_prot.
>      - In udpv6_splice_eof(), use udp_v6_push_pending_frames().

You missed this change ;)


>      - In udpv6_splice_eof(), don't check for AF_INET.
> 
> diff --git a/include/net/inet_common.h b/include/net/inet_common.h
> index 77f4b0ef5b92..a75333342c4e 100644
> --- a/include/net/inet_common.h
> +++ b/include/net/inet_common.h
> @@ -35,6 +35,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock,
>  		   struct sock *newsk);
>  int inet_send_prepare(struct sock *sk);
>  int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
> +void inet_splice_eof(struct socket *sock);
>  ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
>  		      size_t size, int flags);
>  int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 68990a8f556a..49611af31bb7 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -327,6 +327,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
>  int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
>  int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
>  			 size_t size, struct ubuf_info *uarg);
> +void tcp_splice_eof(struct socket *sock);
>  int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
>  		 int flags);
>  int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
> diff --git a/include/net/udp.h b/include/net/udp.h
> index 5cad44318d71..4ed0b47c5582 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -278,6 +278,7 @@ int udp_get_port(struct sock *sk, unsigned short snum,
>  int udp_err(struct sk_buff *, u32);
>  int udp_abort(struct sock *sk, int err);
>  int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
> +void udp_splice_eof(struct socket *sock);
>  int udp_push_pending_frames(struct sock *sk);
>  void udp_flush_pending_frames(struct sock *sk);
>  int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index b5735b3551cf..fd233c4195ac 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -831,6 +831,21 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
>  }
>  EXPORT_SYMBOL(inet_sendmsg);
>  
> +void inet_splice_eof(struct socket *sock)
> +{
> +	const struct proto *prot;
> +	struct sock *sk = sock->sk;
> +
> +	if (unlikely(inet_send_prepare(sk)))
> +		return;
> +
> +	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
> +	prot = READ_ONCE(sk->sk_prot);
> +	if (prot->splice_eof)
> +		prot->splice_eof(sock);
> +}
> +EXPORT_SYMBOL_GPL(inet_splice_eof);
> +
>  ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
>  		      size_t size, int flags)
>  {
> @@ -1050,6 +1065,7 @@ const struct proto_ops inet_stream_ops = {
>  #ifdef CONFIG_MMU
>  	.mmap		   = tcp_mmap,
>  #endif
> +	.splice_eof	   = inet_splice_eof,
>  	.sendpage	   = inet_sendpage,
>  	.splice_read	   = tcp_splice_read,
>  	.read_sock	   = tcp_read_sock,
> @@ -1084,6 +1100,7 @@ const struct proto_ops inet_dgram_ops = {
>  	.read_skb	   = udp_read_skb,
>  	.recvmsg	   = inet_recvmsg,
>  	.mmap		   = sock_no_mmap,
> +	.splice_eof	   = inet_splice_eof,
>  	.sendpage	   = inet_sendpage,
>  	.set_peek_off	   = sk_set_peek_off,
>  #ifdef CONFIG_COMPAT
> @@ -1115,6 +1132,7 @@ static const struct proto_ops inet_sockraw_ops = {
>  	.sendmsg	   = inet_sendmsg,
>  	.recvmsg	   = inet_recvmsg,
>  	.mmap		   = sock_no_mmap,
> +	.splice_eof	   = inet_splice_eof,
>  	.sendpage	   = inet_sendpage,
>  #ifdef CONFIG_COMPAT
>  	.compat_ioctl	   = inet_compat_ioctl,
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 53b7751b68e1..09f03221a6f1 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -1371,6 +1371,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
>  }
>  EXPORT_SYMBOL(tcp_sendmsg);
>  
> +void tcp_splice_eof(struct socket *sock)
> +{
> +	struct sock *sk = sock->sk;
> +	struct tcp_sock *tp = tcp_sk(sk);
> +	int mss_now, size_goal;
> +
> +	if (!tcp_write_queue_tail(sk))
> +		return;
> +
> +	lock_sock(sk);
> +	mss_now = tcp_send_mss(sk, &size_goal, 0);
> +	tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
> +	release_sock(sk);
> +}
> +EXPORT_SYMBOL_GPL(tcp_splice_eof);
> +
>  /*
>   *	Handle reading urgent data. BSD has very simple semantics for
>   *	this, no blocking and very strange errors 8)
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 53e9ce2f05bb..84a5d557dc1a 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -3116,6 +3116,7 @@ struct proto tcp_prot = {
>  	.keepalive		= tcp_set_keepalive,
>  	.recvmsg		= tcp_recvmsg,
>  	.sendmsg		= tcp_sendmsg,
> +	.splice_eof		= tcp_splice_eof,
>  	.sendpage		= tcp_sendpage,
>  	.backlog_rcv		= tcp_v4_do_rcv,
>  	.release_cb		= tcp_release_cb,
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index fd3dae081f3a..df5e407286d7 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1324,6 +1324,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  }
>  EXPORT_SYMBOL(udp_sendmsg);
>  
> +void udp_splice_eof(struct socket *sock)
> +{
> +	struct sock *sk = sock->sk;
> +	struct udp_sock *up = udp_sk(sk);
> +
> +	if (!up->pending || READ_ONCE(up->corkflag))
> +		return;
> +
> +	lock_sock(sk);
> +	if (up->pending && !READ_ONCE(up->corkflag))
> +		udp_push_pending_frames(sk);
> +	release_sock(sk);
> +}
> +EXPORT_SYMBOL_GPL(udp_splice_eof);
> +
>  int udp_sendpage(struct sock *sk, struct page *page, int offset,
>  		 size_t size, int flags)
>  {
> @@ -2918,6 +2933,7 @@ struct proto udp_prot = {
>  	.getsockopt		= udp_getsockopt,
>  	.sendmsg		= udp_sendmsg,
>  	.recvmsg		= udp_recvmsg,
> +	.splice_eof		= udp_splice_eof,
>  	.sendpage		= udp_sendpage,
>  	.release_cb		= ip4_datagram_release_cb,
>  	.hash			= udp_lib_hash,
> diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
> index 2bbf13216a3d..564942bee067 100644
> --- a/net/ipv6/af_inet6.c
> +++ b/net/ipv6/af_inet6.c
> @@ -695,6 +695,7 @@ const struct proto_ops inet6_stream_ops = {
>  #ifdef CONFIG_MMU
>  	.mmap		   = tcp_mmap,
>  #endif
> +	.splice_eof	   = inet_splice_eof,
>  	.sendpage	   = inet_sendpage,
>  	.sendmsg_locked    = tcp_sendmsg_locked,
>  	.sendpage_locked   = tcp_sendpage_locked,
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index d657713d1c71..c17c8ff94b79 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -2150,6 +2150,7 @@ struct proto tcpv6_prot = {
>  	.keepalive		= tcp_set_keepalive,
>  	.recvmsg		= tcp_recvmsg,
>  	.sendmsg		= tcp_sendmsg,
> +	.splice_eof		= tcp_splice_eof,
>  	.sendpage		= tcp_sendpage,
>  	.backlog_rcv		= tcp_v6_do_rcv,
>  	.release_cb		= tcp_release_cb,
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index e5a337e6b970..3a592dc129e9 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1653,6 +1653,20 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  }
>  EXPORT_SYMBOL(udpv6_sendmsg);
>  
> +static void udpv6_splice_eof(struct socket *sock)
> +{
> +	struct sock *sk = sock->sk;
> +	struct udp_sock *up = udp_sk(sk);
> +
> +	if (!up->pending || READ_ONCE(up->corkflag))
> +		return;
> +
> +	lock_sock(sk);
> +	if (up->pending && !READ_ONCE(up->corkflag))
> +		udp_push_pending_frames(sk);
> +	release_sock(sk);
> +}
> +
>  void udpv6_destroy_sock(struct sock *sk)
>  {
>  	struct udp_sock *up = udp_sk(sk);
> @@ -1764,6 +1778,7 @@ struct proto udpv6_prot = {
>  	.getsockopt		= udpv6_getsockopt,
>  	.sendmsg		= udpv6_sendmsg,
>  	.recvmsg		= udpv6_recvmsg,
> +	.splice_eof		= udpv6_splice_eof,
>  	.release_cb		= ip6_datagram_release_cb,
>  	.hash			= udp_lib_hash,
>  	.unhash			= udp_lib_unhash,
David Howells June 7, 2023, 4:01 p.m. UTC | #4
Kuniyuki Iwashima <kuniyu@amazon.com> wrote:

> >      - In udpv6_splice_eof(), use udp_v6_push_pending_frames().
> 
> You missed this change ;)

No I didn't - I just forgot to save the buffer :-/

David
---
commit a630e96e3b1073dc39fd370d60ccb3d5367ce9e6
Author: David Howells <dhowells@redhat.com>
Date:   Wed Jun 7 14:44:34 2023 +0100

    ipv4, ipv6: Use splice_eof() to flush
    
    Allow splice to undo the effects of MSG_MORE after prematurely ending a
    splice/sendfile due to getting an EOF condition (->splice_read() returned
    0) after splice had called sendmsg() with MSG_MORE set when the user didn't
    set MSG_MORE.
    
    For UDP, a pending packet will not be emitted if the socket is closed
    before it is flushed; with this change, it will be flushed by ->splice_eof().
    
    For TCP, it's not clear that MSG_MORE is actually effective.
    
    Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
    Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/
    Signed-off-by: David Howells <dhowells@redhat.com>
    cc: Kuniyuki Iwashima <kuniyu@amazon.com>
    cc: Eric Dumazet <edumazet@google.com>
    cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
    cc: David Ahern <dsahern@kernel.org>
    cc: "David S. Miller" <davem@davemloft.net>
    cc: Jakub Kicinski <kuba@kernel.org>
    cc: Paolo Abeni <pabeni@redhat.com>
    cc: Jens Axboe <axboe@kernel.dk>
    cc: Matthew Wilcox <willy@infradead.org>
    cc: netdev@vger.kernel.org

Notes:
    ver #6)
     - In inet_splice_eof(), use prot after deref of sk->sk_prot.
     - In udpv6_splice_eof(), use udp_v6_push_pending_frames().
     - In udpv6_splice_eof(), don't check for AF_INET.

diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 77f4b0ef5b92..a75333342c4e 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -35,6 +35,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock,
 		   struct sock *newsk);
 int inet_send_prepare(struct sock *sk);
 int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
+void inet_splice_eof(struct socket *sock);
 ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 		      size_t size, int flags);
 int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 68990a8f556a..49611af31bb7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -327,6 +327,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
 			 size_t size, struct ubuf_info *uarg);
+void tcp_splice_eof(struct socket *sock);
 int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
 		 int flags);
 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
diff --git a/include/net/udp.h b/include/net/udp.h
index 5cad44318d71..4ed0b47c5582 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -278,6 +278,7 @@ int udp_get_port(struct sock *sk, unsigned short snum,
 int udp_err(struct sk_buff *, u32);
 int udp_abort(struct sock *sk, int err);
 int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+void udp_splice_eof(struct socket *sock);
 int udp_push_pending_frames(struct sock *sk);
 void udp_flush_pending_frames(struct sock *sk);
 int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b5735b3551cf..fd233c4195ac 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -831,6 +831,21 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 }
 EXPORT_SYMBOL(inet_sendmsg);
 
+void inet_splice_eof(struct socket *sock)
+{
+	const struct proto *prot;
+	struct sock *sk = sock->sk;
+
+	if (unlikely(inet_send_prepare(sk)))
+		return;
+
+	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot->splice_eof)
+		prot->splice_eof(sock);
+}
+EXPORT_SYMBOL_GPL(inet_splice_eof);
+
 ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 		      size_t size, int flags)
 {
@@ -1050,6 +1065,7 @@ const struct proto_ops inet_stream_ops = {
 #ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
 #endif
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
@@ -1084,6 +1100,7 @@ const struct proto_ops inet_dgram_ops = {
 	.read_skb	   = udp_read_skb,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 	.set_peek_off	   = sk_set_peek_off,
 #ifdef CONFIG_COMPAT
@@ -1115,6 +1132,7 @@ static const struct proto_ops inet_sockraw_ops = {
 	.sendmsg	   = inet_sendmsg,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	   = inet_compat_ioctl,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 53b7751b68e1..09f03221a6f1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1371,6 +1371,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 }
 EXPORT_SYMBOL(tcp_sendmsg);
 
+void tcp_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now, size_goal;
+
+	if (!tcp_write_queue_tail(sk))
+		return;
+
+	lock_sock(sk);
+	mss_now = tcp_send_mss(sk, &size_goal, 0);
+	tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
+	release_sock(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_splice_eof);
+
 /*
  *	Handle reading urgent data. BSD has very simple semantics for
  *	this, no blocking and very strange errors 8)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 53e9ce2f05bb..84a5d557dc1a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3116,6 +3116,7 @@ struct proto tcp_prot = {
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
+	.splice_eof		= tcp_splice_eof,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.release_cb		= tcp_release_cb,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fd3dae081f3a..df5e407286d7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1324,6 +1324,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 }
 EXPORT_SYMBOL(udp_sendmsg);
 
+void udp_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct udp_sock *up = udp_sk(sk);
+
+	if (!up->pending || READ_ONCE(up->corkflag))
+		return;
+
+	lock_sock(sk);
+	if (up->pending && !READ_ONCE(up->corkflag))
+		udp_push_pending_frames(sk);
+	release_sock(sk);
+}
+EXPORT_SYMBOL_GPL(udp_splice_eof);
+
 int udp_sendpage(struct sock *sk, struct page *page, int offset,
 		 size_t size, int flags)
 {
@@ -2918,6 +2933,7 @@ struct proto udp_prot = {
 	.getsockopt		= udp_getsockopt,
 	.sendmsg		= udp_sendmsg,
 	.recvmsg		= udp_recvmsg,
+	.splice_eof		= udp_splice_eof,
 	.sendpage		= udp_sendpage,
 	.release_cb		= ip4_datagram_release_cb,
 	.hash			= udp_lib_hash,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2bbf13216a3d..564942bee067 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -695,6 +695,7 @@ const struct proto_ops inet6_stream_ops = {
 #ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
 #endif
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 	.sendmsg_locked    = tcp_sendmsg_locked,
 	.sendpage_locked   = tcp_sendpage_locked,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d657713d1c71..c17c8ff94b79 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2150,6 +2150,7 @@ struct proto tcpv6_prot = {
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
+	.splice_eof		= tcp_splice_eof,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.release_cb		= tcp_release_cb,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e5a337e6b970..317b01c9bc39 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1653,6 +1653,20 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 }
 EXPORT_SYMBOL(udpv6_sendmsg);
 
+static void udpv6_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct udp_sock *up = udp_sk(sk);
+
+	if (!up->pending || READ_ONCE(up->corkflag))
+		return;
+
+	lock_sock(sk);
+	if (up->pending && !READ_ONCE(up->corkflag))
+		udp_v6_push_pending_frames(sk);
+	release_sock(sk);
+}
+
 void udpv6_destroy_sock(struct sock *sk)
 {
 	struct udp_sock *up = udp_sk(sk);
@@ -1764,6 +1778,7 @@ struct proto udpv6_prot = {
 	.getsockopt		= udpv6_getsockopt,
 	.sendmsg		= udpv6_sendmsg,
 	.recvmsg		= udpv6_recvmsg,
+	.splice_eof		= udpv6_splice_eof,
 	.release_cb		= ip6_datagram_release_cb,
 	.hash			= udp_lib_hash,
 	.unhash			= udp_lib_unhash,
diff mbox series

Patch

diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 77f4b0ef5b92..a75333342c4e 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -35,6 +35,7 @@  void __inet_accept(struct socket *sock, struct socket *newsock,
 		   struct sock *newsk);
 int inet_send_prepare(struct sock *sk);
 int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
+void inet_splice_eof(struct socket *sock);
 ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 		      size_t size, int flags);
 int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 68990a8f556a..49611af31bb7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -327,6 +327,7 @@  int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
 			 size_t size, struct ubuf_info *uarg);
+void tcp_splice_eof(struct socket *sock);
 int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
 		 int flags);
 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
diff --git a/include/net/udp.h b/include/net/udp.h
index 5cad44318d71..4ed0b47c5582 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -278,6 +278,7 @@  int udp_get_port(struct sock *sk, unsigned short snum,
 int udp_err(struct sk_buff *, u32);
 int udp_abort(struct sock *sk, int err);
 int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+void udp_splice_eof(struct socket *sock);
 int udp_push_pending_frames(struct sock *sk);
 void udp_flush_pending_frames(struct sock *sk);
 int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b5735b3551cf..6cfb78592836 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -831,6 +831,21 @@  int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 }
 EXPORT_SYMBOL(inet_sendmsg);
 
+void inet_splice_eof(struct socket *sock)
+{
+	const struct proto *prot;
+	struct sock *sk = sock->sk;
+
+	if (unlikely(inet_send_prepare(sk)))
+		return;
+
+	/* IPV6_ADDRFORM can change sk->sk_prot under us; use one snapshot. */
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot->splice_eof)
+		prot->splice_eof(sock);
+}
+EXPORT_SYMBOL_GPL(inet_splice_eof);
+
 ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 		      size_t size, int flags)
 {
@@ -1050,6 +1065,7 @@  const struct proto_ops inet_stream_ops = {
 #ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
 #endif
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
@@ -1084,6 +1100,7 @@  const struct proto_ops inet_dgram_ops = {
 	.read_skb	   = udp_read_skb,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 	.set_peek_off	   = sk_set_peek_off,
 #ifdef CONFIG_COMPAT
@@ -1115,6 +1132,7 @@  static const struct proto_ops inet_sockraw_ops = {
 	.sendmsg	   = inet_sendmsg,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	   = inet_compat_ioctl,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 53b7751b68e1..09f03221a6f1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1371,6 +1371,22 @@  int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 }
 EXPORT_SYMBOL(tcp_sendmsg);
 
+void tcp_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now, size_goal;
+
+	if (!tcp_write_queue_tail(sk))
+		return;
+
+	lock_sock(sk);
+	mss_now = tcp_send_mss(sk, &size_goal, 0);
+	tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
+	release_sock(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_splice_eof);
+
 /*
  *	Handle reading urgent data. BSD has very simple semantics for
  *	this, no blocking and very strange errors 8)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 53e9ce2f05bb..84a5d557dc1a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3116,6 +3116,7 @@  struct proto tcp_prot = {
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
+	.splice_eof		= tcp_splice_eof,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.release_cb		= tcp_release_cb,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fd3dae081f3a..df5e407286d7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1324,6 +1324,21 @@  int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 }
 EXPORT_SYMBOL(udp_sendmsg);
 
+void udp_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct udp_sock *up = udp_sk(sk);
+
+	if (!up->pending || READ_ONCE(up->corkflag))
+		return;
+
+	lock_sock(sk);
+	if (up->pending && !READ_ONCE(up->corkflag))
+		udp_push_pending_frames(sk);
+	release_sock(sk);
+}
+EXPORT_SYMBOL_GPL(udp_splice_eof);
+
 int udp_sendpage(struct sock *sk, struct page *page, int offset,
 		 size_t size, int flags)
 {
@@ -2918,6 +2933,7 @@  struct proto udp_prot = {
 	.getsockopt		= udp_getsockopt,
 	.sendmsg		= udp_sendmsg,
 	.recvmsg		= udp_recvmsg,
+	.splice_eof		= udp_splice_eof,
 	.sendpage		= udp_sendpage,
 	.release_cb		= ip4_datagram_release_cb,
 	.hash			= udp_lib_hash,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2bbf13216a3d..564942bee067 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -695,6 +695,7 @@  const struct proto_ops inet6_stream_ops = {
 #ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
 #endif
+	.splice_eof	   = inet_splice_eof,
 	.sendpage	   = inet_sendpage,
 	.sendmsg_locked    = tcp_sendmsg_locked,
 	.sendpage_locked   = tcp_sendpage_locked,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d657713d1c71..c17c8ff94b79 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2150,6 +2150,7 @@  struct proto tcpv6_prot = {
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
+	.splice_eof		= tcp_splice_eof,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.release_cb		= tcp_release_cb,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e5a337e6b970..6c5975b13ae3 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1653,6 +1653,23 @@  int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 }
 EXPORT_SYMBOL(udpv6_sendmsg);
 
+static void udpv6_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct udp_sock *up = udp_sk(sk);
+
+	if (!up->pending || READ_ONCE(up->corkflag))
+		return;
+
+	/* The test above ran without the socket lock, so repeat it once the
+	 * lock is held before pushing the pending frames out.
+	 */
+	lock_sock(sk);
+	if (up->pending && !READ_ONCE(up->corkflag))
+		udp_v6_push_pending_frames(sk);
+	release_sock(sk);
+}
+
 void udpv6_destroy_sock(struct sock *sk)
 {
 	struct udp_sock *up = udp_sk(sk);
@@ -1764,6 +1781,7 @@  struct proto udpv6_prot = {
 	.getsockopt		= udpv6_getsockopt,
 	.sendmsg		= udpv6_sendmsg,
 	.recvmsg		= udpv6_recvmsg,
+	.splice_eof		= udpv6_splice_eof,
 	.release_cb		= ip6_datagram_release_cb,
 	.hash			= udp_lib_hash,
 	.unhash			= udp_lib_unhash,