
[bpf,2/3] net: poll psock queues too for sockmap sockets

Message ID 20210924220507.24543-3-xiyou.wangcong@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Series sock_map: fix ->poll() and update selftests

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present fail Series targets non-next tree, but doesn't contain any Fixes tags
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf
netdev/subject_prefix success Link
netdev/cc_maintainers warning 16 maintainers not CCed: jiang.wang@bytedance.com davem@davemloft.net kpsingh@kernel.org Rao.Shoaib@oracle.com yhs@fb.com kuniyu@amazon.co.jp songliubraving@fb.com yoshfuji@linux-ipv6.org edumazet@google.com kuba@kernel.org christian.brauner@ubuntu.com andrii@kernel.org kafai@fb.com viro@zeniv.linux.org.uk dsahern@kernel.org ast@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 79 this patch: 79
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 89 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 79 this patch: 79
netdev/header_inline success Link
bpf/vmtest-bpf fail VM_Test
bpf/vmtest-bpf-PR fail PR summary

Commit Message

Cong Wang Sept. 24, 2021, 10:05 p.m. UTC
From: Cong Wang <cong.wang@bytedance.com>

Yucong noticed we can't poll() sockets in sockmap even
when they are the destination sockets of redirections.
This is because we never poll any psock queues in ->poll().
We cannot override ->poll() as it is in struct proto_ops,
not in struct proto.

So introduce sk_msg_poll() to poll the psock ingress_msg queue
and let sockets which support sockmap invoke it directly.

Reported-by: Yucong Sun <sunyucong@gmail.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/linux/skmsg.h |  6 ++++++
 net/core/skmsg.c      | 15 +++++++++++++++
 net/ipv4/tcp.c        |  2 ++
 net/ipv4/udp.c        |  2 ++
 net/unix/af_unix.c    |  5 +++++
 5 files changed, 30 insertions(+)
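
To make the reported symptom concrete, here is a minimal user-space sketch (not part of the patch). It assumes dst_fd is a socket that has already been inserted into a sockmap as the target of an sk_msg/sk_skb redirect; the map and BPF program setup are omitted. Without this patch, the epoll_wait() below can time out even though recvmsg() on dst_fd would return the redirected data, because ->poll() never looks at the psock ingress_msg queue.

/* Sketch only: dst_fd is assumed to be a sockmap redirect target;
 * sockmap and BPF program setup is not shown.
 */
#include <unistd.h>
#include <sys/epoll.h>

static int wait_readable(int dst_fd, int timeout_ms)
{
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = dst_fd };
	struct epoll_event out;
	int epfd, n;

	epfd = epoll_create1(0);
	if (epfd < 0)
		return -1;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, dst_fd, &ev) < 0) {
		close(epfd);
		return -1;
	}
	/* Without this patch: may return 0 (timeout) even though data
	 * already sits in the destination psock's ingress_msg queue.
	 */
	n = epoll_wait(epfd, &out, 1, timeout_ms);
	close(epfd);
	return n;
}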

Comments

John Fastabend Sept. 27, 2021, 6:07 p.m. UTC | #1
Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
> 
> Yucong noticed we can't poll() sockets in sockmap even
> when they are the destination sockets of redirections.
> This is because we never poll any psock queues in ->poll().
> We cannot override ->poll() as it is in struct proto_ops,
> not in struct proto.
> 
> So introduce sk_msg_poll() to poll the psock ingress_msg queue
> and let sockets which support sockmap invoke it directly.
> 
> Reported-by: Yucong Sun <sunyucong@gmail.com>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Cc: Lorenz Bauer <lmb@cloudflare.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
>  include/linux/skmsg.h |  6 ++++++
>  net/core/skmsg.c      | 15 +++++++++++++++
>  net/ipv4/tcp.c        |  2 ++
>  net/ipv4/udp.c        |  2 ++
>  net/unix/af_unix.c    |  5 +++++
>  5 files changed, 30 insertions(+)
> 

[...]
>   						  struct sk_buff *skb)
>  {
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index e8b48df73c85..2eb1a87ba056 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -280,6 +280,7 @@
>  #include <linux/uaccess.h>
>  #include <asm/ioctls.h>
>  #include <net/busy_poll.h>
> +#include <linux/skmsg.h>
>  
>  /* Track pending CMSGs. */
>  enum {
> @@ -563,6 +564,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
>  
>  		if (tcp_stream_is_readable(sk, target))
>  			mask |= EPOLLIN | EPOLLRDNORM;
> +		mask |= sk_msg_poll(sk);
>  
>  		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
>  			if (__sk_stream_is_writeable(sk, 1)) {


For TCP we already have the stream_memory_read() hook, which we implement
in tcp_bpf.c as tcp_bpf_stream_read. This just checks the psock->ingress_msg
list, which should cover any redirect from skmsg into the ingress side
of another socket.

And the tcp_poll logic uses tcp_stream_is_readable(), which checks for
sk->sk_prot->stream_memory_read() and then calls it.

The straight receive path, i.e. not redirected from a sender, should
be covered by the normal tcp_epollin_ready() checks, because this
happens after TCP does the normal updates to rcv_nxt, copied_seq,
etc.

So the above is not needed in the TCP case, by my reading. Did I miss a
case? We have also done tests with Envoy, which I thought were polling,
so I'll check on that as well.

Thanks,
John
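
For context, the TCP-only mechanism described above looks roughly like the sketch below. This is an approximation of the code around the time of this posting, not a verbatim quote of the upstream functions.

/* Rough shape of the existing TCP-only path (approximate, not a
 * verbatim quote of upstream code).
 */

/* net/ipv4/tcp_bpf.c: installed as sk->sk_prot->stream_memory_read
 * when a psock is attached to the socket.
 */
static bool tcp_bpf_stream_read(const struct sock *sk)
{
	struct sk_psock *psock;
	bool empty = true;

	rcu_read_lock();
	psock = sk_psock(sk);
	if (likely(psock))
		empty = list_empty(&psock->ingress_msg);
	rcu_read_unlock();
	return !empty;
}

/* net/ipv4/tcp.c: consulted from tcp_poll() */
static bool tcp_stream_is_readable(const struct sock *sk, int target)
{
	if (tcp_epollin_ready(sk, target))
		return true;
	if (sk->sk_prot->stream_memory_read)
		return sk->sk_prot->stream_memory_read(sk);
	return false;
}
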
Cong Wang Sept. 27, 2021, 7:29 p.m. UTC | #2
On Mon, Sep 27, 2021 at 11:07 AM John Fastabend
<john.fastabend@gmail.com> wrote:
>
> Cong Wang wrote:
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > Yucong noticed we can't poll() sockets in sockmap even
> > when they are the destination sockets of redirections.
> > This is because we never poll any psock queues in ->poll().
> > We cannot override ->poll() as it is in struct proto_ops,
> > not in struct proto.
> >
> > So introduce sk_msg_poll() to poll the psock ingress_msg queue
> > and let sockets which support sockmap invoke it directly.
> >
> > Reported-by: Yucong Sun <sunyucong@gmail.com>
> > Cc: John Fastabend <john.fastabend@gmail.com>
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > Cc: Lorenz Bauer <lmb@cloudflare.com>
> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > ---
> >  include/linux/skmsg.h |  6 ++++++
> >  net/core/skmsg.c      | 15 +++++++++++++++
> >  net/ipv4/tcp.c        |  2 ++
> >  net/ipv4/udp.c        |  2 ++
> >  net/unix/af_unix.c    |  5 +++++
> >  5 files changed, 30 insertions(+)
> >
>
> [...]
>                                                   struct sk_buff *skb)
> >  {
> > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> > index e8b48df73c85..2eb1a87ba056 100644
> > --- a/net/ipv4/tcp.c
> > +++ b/net/ipv4/tcp.c
> > @@ -280,6 +280,7 @@
> >  #include <linux/uaccess.h>
> >  #include <asm/ioctls.h>
> >  #include <net/busy_poll.h>
> > +#include <linux/skmsg.h>
> >
> >  /* Track pending CMSGs. */
> >  enum {
> > @@ -563,6 +564,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
> >
> >               if (tcp_stream_is_readable(sk, target))
> >                       mask |= EPOLLIN | EPOLLRDNORM;
> > +             mask |= sk_msg_poll(sk);
> >
> >               if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
> >                       if (__sk_stream_is_writeable(sk, 1)) {
>
>
> For TCP we already have the stream_memory_read() hook, which we implement
> in tcp_bpf.c as tcp_bpf_stream_read. This just checks the psock->ingress_msg
> list, which should cover any redirect from skmsg into the ingress side
> of another socket.
>
> And the tcp_poll logic uses tcp_stream_is_readable(), which checks for
> sk->sk_prot->stream_memory_read() and then calls it.

Ah, I missed it. It is better to have such a hook in struct proto,
since we can just override it with BPF hooks. Let me rename it
for non-TCP and implement it for UDP and AF_UNIX too.

>
> The straight receive path, i.e. not redirected from a sender, should
> be covered by the normal tcp_epollin_ready() checks, because this
> happens after TCP does the normal updates to rcv_nxt, copied_seq,
> etc.

Yes.

>
> So the above is not needed in the TCP case, by my reading. Did I miss a
> case? We have also done tests with Envoy, which I thought were polling,
> so I'll check on that as well.

Right, all of these selftests in patch 3/3 are non-TCP.

Thanks.
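
The direction sketched in this reply, a protocol-agnostic readability hook in struct proto that sockmap can override like its other callbacks, could look roughly as follows. The names sock_is_readable and sk_is_readable are illustrative placeholders, not taken from this series.

/* Illustrative only: suppose struct proto grew a generic hook, e.g.
 *
 *	bool (*sock_is_readable)(struct sock *sk);
 *
 * which sockmap points at a psock-aware implementation, plus a small
 * helper each protocol's ->poll() can call.
 */
static inline bool sk_is_readable(struct sock *sk)
{
	if (sk->sk_prot->sock_is_readable)
		return sk->sk_prot->sock_is_readable(sk);
	return false;
}

/* e.g. in udp_poll() / unix_poll():
 *
 *	if (sk_is_readable(sk))
 *		mask |= EPOLLIN | EPOLLRDNORM;
 */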

Patch

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index d47097f2c8c0..163b0cc1703a 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -128,6 +128,7 @@  int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
 			     struct sk_msg *msg, u32 bytes);
 int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 		   int len, int flags);
+__poll_t sk_msg_poll(struct sock *sk);
 
 static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
 {
@@ -562,5 +563,10 @@  static inline void skb_bpf_redirect_clear(struct sk_buff *skb)
 {
 	skb->_sk_redir = 0;
 }
+#else
+static inline __poll_t sk_msg_poll(struct sock *sk)
+{
+	return 0;
+}
 #endif /* CONFIG_NET_SOCK_MSG */
 #endif /* _LINUX_SKMSG_H */
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 2d6249b28928..8e6d7ea43eca 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -474,6 +474,21 @@  int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 }
 EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
 
+__poll_t sk_msg_poll(struct sock *sk)
+{
+	struct sk_psock *psock;
+	__poll_t mask = 0;
+
+	psock = sk_psock_get_checked(sk);
+	if (IS_ERR_OR_NULL(psock))
+		return 0;
+	if (!sk_psock_queue_empty(psock))
+		mask |= EPOLLIN | EPOLLRDNORM;
+	sk_psock_put(sk, psock);
+	return mask;
+}
+EXPORT_SYMBOL_GPL(sk_msg_poll);
+
 static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
 						  struct sk_buff *skb)
 {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e8b48df73c85..2eb1a87ba056 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -280,6 +280,7 @@ 
 #include <linux/uaccess.h>
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
+#include <linux/skmsg.h>
 
 /* Track pending CMSGs. */
 enum {
@@ -563,6 +564,7 @@  __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 		if (tcp_stream_is_readable(sk, target))
 			mask |= EPOLLIN | EPOLLRDNORM;
+		mask |= sk_msg_poll(sk);
 
 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 			if (__sk_stream_is_writeable(sk, 1)) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8851c9463b4b..fbc989d27388 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -97,6 +97,7 @@ 
 #include <linux/skbuff.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/skmsg.h>
 #include <net/net_namespace.h>
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -2866,6 +2867,7 @@  __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
 		mask &= ~(EPOLLIN | EPOLLRDNORM);
 
+	mask |= sk_msg_poll(sk);
 	return mask;
 
 }
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 92345c9bb60c..5d705541d082 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -114,6 +114,7 @@ 
 #include <linux/freezer.h>
 #include <linux/file.h>
 #include <linux/btf_ids.h>
+#include <linux/skmsg.h>
 
 #include "scm.h"
 
@@ -3015,6 +3016,8 @@  static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
+	mask |= sk_msg_poll(sk);
+
 	/* Connection-based need to check for termination and startup */
 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
 	    sk->sk_state == TCP_CLOSE)
@@ -3054,6 +3057,8 @@  static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
+	mask |= sk_msg_poll(sk);
+
 	/* Connection-based need to check for termination and startup */
 	if (sk->sk_type == SOCK_SEQPACKET) {
 		if (sk->sk_state == TCP_CLOSE)