diff mbox series

[bpf,v3,1/4] af_unix: Disable MSG_OOB handling for sockets in sockmap/sockhash

Message ID 20240707222842.4119416-2-mhal@rbox.co (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series af_unix: MSG_OOB handling fix & selftest | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 45 this patch: 45
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 9 of 9 maintainers
netdev/build_clang fail Errors and warnings before: 36 this patch: 36
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn fail Errors and warnings before: 49 this patch: 49
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 59 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-PR success PR summary
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc

Commit Message

Michal Luczaj July 7, 2024, 9:28 p.m. UTC
AF_UNIX socket tracks the most recent OOB packet (in its receive queue)
with an `oob_skb` pointer. BPF redirecting does not account for that: when
an OOB packet is moved between sockets, `oob_skb` is left outdated. This
results in a single skb that may be accessed from two different sockets.

Take the easy way out: silently drop MSG_OOB data targeting any socket that
is in a sockmap or a sockhash. Note that such silent drop is akin to the
fate of redirected skb's scm_fp_list (SCM_RIGHTS, SCM_CREDENTIALS).

For symmetry, forbid MSG_OOB in unix_bpf_recvmsg().

Suggested-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Fixes: 314001f0bf92 ("af_unix: Add OOB support")
Signed-off-by: Michal Luczaj <mhal@rbox.co>
---
 net/unix/af_unix.c  | 41 ++++++++++++++++++++++++++++++++++++++++-
 net/unix/unix_bpf.c |  3 +++
 2 files changed, 43 insertions(+), 1 deletion(-)

Comments

Kuniyuki Iwashima July 8, 2024, 7:38 p.m. UTC | #1
From: Michal Luczaj <mhal@rbox.co>
Date: Sun,  7 Jul 2024 23:28:22 +0200
> AF_UNIX socket tracks the most recent OOB packet (in its receive queue)
> with an `oob_skb` pointer. BPF redirecting does not account for that: when
> an OOB packet is moved between sockets, `oob_skb` is left outdated. This
> results in a single skb that may be accessed from two different sockets.
> 
> Take the easy way out: silently drop MSG_OOB data targeting any socket that
> is in a sockmap or a sockhash. Note that such silent drop is akin to the
> fate of redirected skb's scm_fp_list (SCM_RIGHTS, SCM_CREDENTIALS).
> 
> For symmetry, forbid MSG_OOB in unix_bpf_recvmsg().
> 
> Suggested-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> Fixes: 314001f0bf92 ("af_unix: Add OOB support")
> Signed-off-by: Michal Luczaj <mhal@rbox.co>

Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>

Thanks!


> ---
>  net/unix/af_unix.c  | 41 ++++++++++++++++++++++++++++++++++++++++-
>  net/unix/unix_bpf.c |  3 +++
>  2 files changed, 43 insertions(+), 1 deletion(-)
> 
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 142f56770b77..11cb5badafb6 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -2667,10 +2667,49 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
>  
>  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
>  {
> +	struct unix_sock *u = unix_sk(sk);
> +	struct sk_buff *skb;
> +	int err;
> +
>  	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
>  		return -ENOTCONN;
>  
> -	return unix_read_skb(sk, recv_actor);
> +	mutex_lock(&u->iolock);
> +	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
> +	mutex_unlock(&u->iolock);
> +	if (!skb)
> +		return err;
> +
> +#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
> +	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
> +		bool drop = false;
> +
> +		unix_state_lock(sk);
> +
> +		if (sock_flag(sk, SOCK_DEAD)) {
> +			unix_state_unlock(sk);
> +			kfree_skb(skb);
> +			return -ECONNRESET;
> +		}
> +
> +		spin_lock(&sk->sk_receive_queue.lock);
> +		if (likely(skb == u->oob_skb)) {
> +			WRITE_ONCE(u->oob_skb, NULL);
> +			drop = true;
> +		}
> +		spin_unlock(&sk->sk_receive_queue.lock);
> +
> +		unix_state_unlock(sk);
> +
> +		if (drop) {
> +			WARN_ON_ONCE(skb_unref(skb));
> +			kfree_skb(skb);
> +			return -EAGAIN;
> +		}
> +	}
> +#endif
> +
> +	return recv_actor(sk, skb);
>  }
>  
>  static int unix_stream_read_generic(struct unix_stream_read_state *state,
> diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
> index bd84785bf8d6..bca2d86ba97d 100644
> --- a/net/unix/unix_bpf.c
> +++ b/net/unix/unix_bpf.c
> @@ -54,6 +54,9 @@ static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
>  	struct sk_psock *psock;
>  	int copied;
>  
> +	if (flags & MSG_OOB)
> +		return -EOPNOTSUPP;
> +
>  	if (!len)
>  		return 0;
>  
> -- 
> 2.45.2
John Fastabend July 9, 2024, 1:24 a.m. UTC | #2
Kuniyuki Iwashima wrote:
> From: Michal Luczaj <mhal@rbox.co>
> Date: Sun,  7 Jul 2024 23:28:22 +0200
> > AF_UNIX socket tracks the most recent OOB packet (in its receive queue)
> > with an `oob_skb` pointer. BPF redirecting does not account for that: when
> > an OOB packet is moved between sockets, `oob_skb` is left outdated. This
> > results in a single skb that may be accessed from two different sockets.
> > 
> > Take the easy way out: silently drop MSG_OOB data targeting any socket that
> > is in a sockmap or a sockhash. Note that such silent drop is akin to the
> > fate of redirected skb's scm_fp_list (SCM_RIGHTS, SCM_CREDENTIALS).
> > 
> > For symmetry, forbid MSG_OOB in unix_bpf_recvmsg().
> > 
> > Suggested-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > Fixes: 314001f0bf92 ("af_unix: Add OOB support")
> > Signed-off-by: Michal Luczaj <mhal@rbox.co>
> 

Why does af_unix put the oob data on the sk_receive_queue()? Wouldn't it
be enough to just have the ousk->oob_skb hold the reference to the skb?

I think for TCP/UDP at least I'll want to handle MSG_OOB data correctly.
For redirect its probably fine to just drop or skip it, but when we are
just reading recv msgs and parsing/observing it would be nice to not change
how the application works. In practice I don't recall anyone reporting
issues on TCP side though from incorrectly handling URG data.

From TCP side I believe we can fix the OOB case by checking the oob queue
before doing the recvmsg handling. If the urg data wasn't on the general
sk_receive_queue we could do similar here for af_unix? My argument for
URG not working for redirect would be to let userspace handle it if they
cared.

Thanks.

> Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> 
> Thanks!
> 
> 
> > ---
> >  net/unix/af_unix.c  | 41 ++++++++++++++++++++++++++++++++++++++++-
> >  net/unix/unix_bpf.c |  3 +++
> >  2 files changed, 43 insertions(+), 1 deletion(-)
> > 
> > diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> > index 142f56770b77..11cb5badafb6 100644
> > --- a/net/unix/af_unix.c
> > +++ b/net/unix/af_unix.c
> > @@ -2667,10 +2667,49 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
> >  
> >  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
> >  {
> > +	struct unix_sock *u = unix_sk(sk);
> > +	struct sk_buff *skb;
> > +	int err;
> > +
> >  	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
> >  		return -ENOTCONN;
> >  
> > -	return unix_read_skb(sk, recv_actor);
> > +	mutex_lock(&u->iolock);
> > +	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
> > +	mutex_unlock(&u->iolock);
> > +	if (!skb)
> > +		return err;
> > +
> > +#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
> > +	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
> > +		bool drop = false;
> > +
> > +		unix_state_lock(sk);
> > +
> > +		if (sock_flag(sk, SOCK_DEAD)) {
> > +			unix_state_unlock(sk);
> > +			kfree_skb(skb);
> > +			return -ECONNRESET;
> > +		}
> > +
> > +		spin_lock(&sk->sk_receive_queue.lock);
> > +		if (likely(skb == u->oob_skb)) {
> > +			WRITE_ONCE(u->oob_skb, NULL);
> > +			drop = true;
> > +		}
> > +		spin_unlock(&sk->sk_receive_queue.lock);
> > +
> > +		unix_state_unlock(sk);
> > +
> > +		if (drop) {
> > +			WARN_ON_ONCE(skb_unref(skb));
> > +			kfree_skb(skb);
> > +			return -EAGAIN;
> > +		}
> > +	}
> > +#endif
> > +
> > +	return recv_actor(sk, skb);
> >  }
> >  
> >  static int unix_stream_read_generic(struct unix_stream_read_state *state,
> > diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
> > index bd84785bf8d6..bca2d86ba97d 100644
> > --- a/net/unix/unix_bpf.c
> > +++ b/net/unix/unix_bpf.c
> > @@ -54,6 +54,9 @@ static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
> >  	struct sk_psock *psock;
> >  	int copied;
> >  
> > +	if (flags & MSG_OOB)
> > +		return -EOPNOTSUPP;
> > +
> >  	if (!len)
> >  		return 0;
> >  
> > -- 
> > 2.45.2
Kuniyuki Iwashima July 9, 2024, 2:18 a.m. UTC | #3
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 08 Jul 2024 18:24:02 -0700
> Kuniyuki Iwashima wrote:
> > From: Michal Luczaj <mhal@rbox.co>
> > Date: Sun,  7 Jul 2024 23:28:22 +0200
> > > AF_UNIX socket tracks the most recent OOB packet (in its receive queue)
> > > with an `oob_skb` pointer. BPF redirecting does not account for that: when
> > > an OOB packet is moved between sockets, `oob_skb` is left outdated. This
> > > results in a single skb that may be accessed from two different sockets.
> > > 
> > > Take the easy way out: silently drop MSG_OOB data targeting any socket that
> > > is in a sockmap or a sockhash. Note that such silent drop is akin to the
> > > fate of redirected skb's scm_fp_list (SCM_RIGHTS, SCM_CREDENTIALS).
> > > 
> > > For symmetry, forbid MSG_OOB in unix_bpf_recvmsg().
> > > 
> > > Suggested-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > Fixes: 314001f0bf92 ("af_unix: Add OOB support")
> > > Signed-off-by: Michal Luczaj <mhal@rbox.co>
> > 
> 
> Why does af_unix put the oob data on the sk_receive_queue()? Wouldn't it
> be enough to just have the ousk->oob_skb hold the reference to the skb?

We need to remember when oob_skb was sent for some reasons:

  1. OOB data must stop recv() at the point
  2. ioctl(SIOCATMARK) must return true if OOB data is at the head of
     the recvq regardless that the data is already consumed or not

  See tools/testing/selftests/net/af_unix/msg_oob.c

And actually TCP has OOB data in the normal skb queued in recvq.

TCP also holds the copied data in tp->urg_data and SEQ# in tp->urg_seq.
Later when recv() passes through the OOB SEQ#, the recv() is stopped at
OOB data.

Note that the implementation has some bugs, e.g. it doesn't have a flag
to indicate if each OOB data is consumed, and we can recv() the stale
OOB data twice.

(CCed Takamitsu who is working on the URG bugs on the TCP side)


> I think for TCP/UDP at least I'll want to handle MSG_OOB data correctly.

UDP doesn't support MSG_OOB.


> For redirect its probably fine to just drop or skip it, but when we are
> just reading recv msgs and parsing/observing it would be nice to not change
> how the application works. In practice I don't recall anyone reporting
> issues on TCP side though from incorrectly handling URG data.

IIUC, the normal read case is processed by tcp_recvmsg(), right ?
Then, OOB data is handled at the found_ok_skb/skip_copy: labels.


> 
> From TCP side I believe we can fix the OOB case by checking the oob queue
> before doing the recvmsg handling. If the urg data wasn't on the general
> sk_receive_queue we could do similar here for af_unix? My argument for
> URG not working for redirect would be to let userspace handle it if they
> cared.

For the redirect case, in tcp_read_skb(), we need to check tp->urg_data and
tp->{copied,urg}_seq and clear them if needed as done in tcp_recvmsg().
Jakub Sitnicki July 9, 2024, 9:48 a.m. UTC | #4
On Sun, Jul 07, 2024 at 11:28 PM +02, Michal Luczaj wrote:
> AF_UNIX socket tracks the most recent OOB packet (in its receive queue)
> with an `oob_skb` pointer. BPF redirecting does not account for that: when
> an OOB packet is moved between sockets, `oob_skb` is left outdated. This
> results in a single skb that may be accessed from two different sockets.
>
> Take the easy way out: silently drop MSG_OOB data targeting any socket that
> is in a sockmap or a sockhash. Note that such silent drop is akin to the
> fate of redirected skb's scm_fp_list (SCM_RIGHTS, SCM_CREDENTIALS).
>
> For symmetry, forbid MSG_OOB in unix_bpf_recvmsg().
>
> Suggested-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> Fixes: 314001f0bf92 ("af_unix: Add OOB support")
> Signed-off-by: Michal Luczaj <mhal@rbox.co>
> ---

Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
diff mbox series

Patch

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 142f56770b77..11cb5badafb6 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2667,10 +2667,49 @@  static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
 
 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
+	struct unix_sock *u = unix_sk(sk);
+	struct sk_buff *skb;
+	int err;
+
 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
 		return -ENOTCONN;
 
-	return unix_read_skb(sk, recv_actor);
+	mutex_lock(&u->iolock);
+	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
+	mutex_unlock(&u->iolock);
+	if (!skb)
+		return err;
+
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
+		bool drop = false;
+
+		unix_state_lock(sk);
+
+		if (sock_flag(sk, SOCK_DEAD)) {
+			unix_state_unlock(sk);
+			kfree_skb(skb);
+			return -ECONNRESET;
+		}
+
+		spin_lock(&sk->sk_receive_queue.lock);
+		if (likely(skb == u->oob_skb)) {
+			WRITE_ONCE(u->oob_skb, NULL);
+			drop = true;
+		}
+		spin_unlock(&sk->sk_receive_queue.lock);
+
+		unix_state_unlock(sk);
+
+		if (drop) {
+			WARN_ON_ONCE(skb_unref(skb));
+			kfree_skb(skb);
+			return -EAGAIN;
+		}
+	}
+#endif
+
+	return recv_actor(sk, skb);
 }
 
 static int unix_stream_read_generic(struct unix_stream_read_state *state,
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
index bd84785bf8d6..bca2d86ba97d 100644
--- a/net/unix/unix_bpf.c
+++ b/net/unix/unix_bpf.c
@@ -54,6 +54,9 @@  static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
 	struct sk_psock *psock;
 	int copied;
 
+	if (flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
 	if (!len)
 		return 0;